From 446164d7b9c8197edb96a3471565813ed1361b6c Mon Sep 17 00:00:00 2001
From: Timothy Carambat <rambat1010@gmail.com>
Date: Mon, 21 Oct 2024 12:37:39 -0700
Subject: [PATCH] Add Groq vision preview support (#2511)

Adds support for only the llama3.2 vision models on Groq. This comes with many conditionals and nuances to handle, as Groq's vision implementation is quite bad right now.
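
For reference, when attachments are present on a supported vision model the
provider now sends a single user message whose content is an array of parts
(shape taken from this patch; the values below are illustrative only):

  [
    {
      role: "user",
      content: [
        { type: "text", text: "What is in this image?" },
        {
          type: "image_url",
          image_url: { url: "data:image/png;base64,..." },
        },
      ],
    },
  ]

The system prompt and chat history are intentionally dropped for these calls;
see the comments in #conditionalPromptStruct for why.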
---
 server/utils/AiProviders/groq/index.js | 117 ++++++++++++++++++++++++--
 1 file changed, 112 insertions(+), 5 deletions(-)

diff --git a/server/utils/AiProviders/groq/index.js b/server/utils/AiProviders/groq/index.js
index c176f1dca..d928e5e0d 100644
--- a/server/utils/AiProviders/groq/index.js
+++ b/server/utils/AiProviders/groq/index.js
@@ -37,6 +37,10 @@ class GroqLLM {
     );
   }
 
+  #log(text, ...args) {
+    console.log(`\x1b[32m[GroqAi]\x1b[0m ${text}`, ...args);
+  }
+
   streamingEnabled() {
     return "streamGetChatCompletion" in this;
   }
@@ -53,17 +57,120 @@ class GroqLLM {
     return !!modelName; // name just needs to exist
   }
 
+  /**
+   * Generates the appropriate content payload for a message + attachments.
+   * @param {{userPrompt: string, attachments: import("../../helpers").Attachment[]}} params
+   * @returns {string|object[]}
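+   * @example
+   * // Illustrative only; assumes attachment.contentString is a base64 data URL
+   * this.#generateContent({ userPrompt: "Describe this", attachments: [{ contentString: "data:image/png;base64,..." }] });
+   * // => [{ type: "text", text: "Describe this" }, { type: "image_url", image_url: { url: "data:image/png;base64,..." } }]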
+   */
+  #generateContent({ userPrompt, attachments = [] }) {
+    if (!attachments.length) return userPrompt;
+
+    const content = [{ type: "text", text: userPrompt }];
+    for (let attachment of attachments) {
+      content.push({
+        type: "image_url",
+        image_url: {
+          url: attachment.contentString,
+        },
+      });
+    }
+    return content;
+  }
+
+  /**
+   * Last updated: October 21, 2024
+   * According to https://console.groq.com/docs/vision, each of the supported
+   * vision models mishandles prompting in its own way.
+   * Currently, the llama3.2 vision models are preview-only and subject to change, and the llava model is deprecated - so we do not support attachments for llava at all.
+   *
+   * Since we can only explicitly support the current models, this is a temporary solution.
+   * If attachments are empty or the model is not a vision model, we return the default prompt structure, which works for all models.
+   * If attachments are present and the model is a vision model, we return only the user prompt with attachments - see the comment inside the return for details.
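+   * @example
+   * // Illustrative: attachments + a supported vision model yields only the
+   * // user message (no system prompt, no history):
+   * this.#conditionalPromptStruct({ userPrompt: "What is this?", attachments: [attachment] });
+   * // => [{ role: "user", content: [ ...text and image_url parts ] }]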
+   */
+  #conditionalPromptStruct({
+    systemPrompt = "",
+    contextTexts = [],
+    chatHistory = [],
+    userPrompt = "",
+    attachments = [], // Attachments for this specific prompt only
+  }) {
+    const VISION_MODELS = [
+      "llama-3.2-90b-vision-preview",
+      "llama-3.2-11b-vision-preview",
+    ];
+    const DEFAULT_PROMPT_STRUCT = [
+      {
+        role: "system",
+        content: `${systemPrompt}${this.#appendContext(contextTexts)}`,
+      },
+      ...chatHistory,
+      { role: "user", content: userPrompt },
+    ];
+
+    // If there are no attachments, or the model is not a vision model, return the default prompt structure:
+    // there is nothing to attach and no model limitations to work around.
+    if (!attachments.length) return DEFAULT_PROMPT_STRUCT;
+    if (!VISION_MODELS.includes(this.model)) {
+      this.#log(
+        `${this.model} is not an explicitly supported vision model! Will omit attachments.`
+      );
+      return DEFAULT_PROMPT_STRUCT;
+    }
+
+    return [
+      // Why are the system prompt and history commented out?
+      // The current Groq vision models perform VERY poorly with ANY history or text prior to the image.
+      // To avoid LLM refusals on every single message, we do not include the "system prompt" or the chat history.
+      // This is a temporary solution until Groq fixes their vision models to be more coherent and also handle context prior to the image.
+      // Note for the future:
+      // Groq vision models also do not support system prompts - which is why you see the user/assistant emulation used instead of "system".
+      // This means any vision call is assessed independently of the chat context prior to the image.
+      /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      // {
+      //   role: "user",
+      //   content: `${systemPrompt}${this.#appendContext(contextTexts)}`,
+      // },
+      // {
+      //   role: "assistant",
+      //   content: "OK",
+      // },
+      // ...chatHistory,
+      {
+        role: "user",
+        content: this.#generateContent({ userPrompt, attachments }),
+      },
+    ];
+  }
+
+  /**
+   * Constructs the full prompt (system, history, and user message) for this model.
+   * @param {{attachments: import("../../helpers").Attachment[]}} param0
+   * @returns {object[]}
+   */
   constructPrompt({
     systemPrompt = "",
     contextTexts = [],
     chatHistory = [],
     userPrompt = "",
+    attachments = [], // Attachments for this specific prompt only
   }) {
-    const prompt = {
-      role: "system",
-      content: `${systemPrompt}${this.#appendContext(contextTexts)}`,
-    };
-    return [prompt, ...chatHistory, { role: "user", content: userPrompt }];
+    // NOTE: See GroqLLM.#conditionalPromptStruct for details on how attachments are handled for Groq.
+    return this.#conditionalPromptStruct({
+      systemPrompt,
+      contextTexts,
+      chatHistory,
+      userPrompt,
+      attachments,
+    });
   }
 
   async getChatCompletion(messages = null, { temperature = 0.7 }) {