From ca45fce8ac49285872ae1a72ca04f880f0d027fb Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 13 Aug 2024 15:26:52 +0530 Subject: [PATCH 1/3] Break long links in train of thought to stay within chat page width --- src/interface/web/app/components/chatMessage/chatMessage.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/interface/web/app/components/chatMessage/chatMessage.tsx b/src/interface/web/app/components/chatMessage/chatMessage.tsx index 2613ef47..98f74d5a 100644 --- a/src/interface/web/app/components/chatMessage/chatMessage.tsx +++ b/src/interface/web/app/components/chatMessage/chatMessage.tsx @@ -262,7 +262,7 @@ export function TrainOfThought(props: TrainOfThoughtProps) { let markdownRendered = DOMPurify.sanitize(md.render(props.message)); return (
{icon}
From acdc3f947077caf6b9b8b0958269515c38d88daf Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 16 Aug 2024 05:20:24 -0500 Subject: [PATCH 2/3] Unwrap any JSON in Markdown code blocks when parsing chat actor responses This is a more robust way to extract the JSON output requested from gemma-2 (2B, 9B) models, which tend to return JSON in Markdown code blocks. Other models should remain unaffected by this change. Also removed the request to not wrap JSON in code blocks from the prompts, as the code now does the unwrapping automatically when present. --- pyproject.toml | 2 +- src/khoj/processor/conversation/offline/chat_model.py | 3 +++ src/khoj/processor/conversation/prompts.py | 2 +- src/khoj/routers/helpers.py | 6 ++++++ 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4b651dad..edbbb655 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,7 @@ dependencies = [ "pymupdf >= 1.23.5", "django == 5.0.7", "authlib == 1.2.1", - "llama-cpp-python == 0.2.82", + "llama-cpp-python == 0.2.88", "itsdangerous == 2.1.2", "httpx == 0.25.0", "pgvector == 0.2.4", diff --git a/src/khoj/processor/conversation/offline/chat_model.py b/src/khoj/processor/conversation/offline/chat_model.py index ec4c7367..1251dcec 100644 --- a/src/khoj/processor/conversation/offline/chat_model.py +++ b/src/khoj/processor/conversation/offline/chat_model.py @@ -103,6 +103,9 @@ def extract_questions_offline( .replace("']", '"]') .replace("', '", '", "') ) + # Remove any markdown json codeblock formatting if present (useful for gemma-2) + if response.startswith("```json"): + response = response[7:-3] questions: List[str] = json.loads(questions_str) questions = filter_questions(questions) except: diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py index 6a8db9db..ffd7d094 100644 --- a/src/khoj/processor/conversation/prompts.py +++ b/src/khoj/processor/conversation/prompts.py @@ -587,7 +587,7 @@ You are Khoj, an advanced google search assistant. You are tasked with construct - Official, up-to-date information about you, Khoj, is available at site:khoj.dev, github or pypi. What Google searches, if any, will you need to perform to answer the user's question? -Provide search queries as a list of strings in a JSON object. Do not wrap the json in a codeblock. +Provide search queries as a list of strings in a JSON object. 
Current Date: {current_date} User's Location: {location} {username} diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index 4e4f5a56..4da60717 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -279,6 +279,9 @@ async def aget_relevant_information_sources(query: str, conversation_history: di try: response = response.strip() + # Remove any markdown json codeblock formatting if present (useful for gemma-2) + if response.startswith("```json"): + response = response[7:-3] response = json.loads(response) response = [q.strip() for q in response["source"] if q.strip()] if not isinstance(response, list) or not response or len(response) == 0: @@ -401,6 +404,9 @@ async def generate_online_subqueries( # Validate that the response is a non-empty, JSON-serializable list try: response = response.strip() + # Remove any markdown json codeblock formatting if present (useful for gemma-2) + if response.startswith("```json") and response.endswith("```"): + response = response[7:-3] response = json.loads(response) response = [q.strip() for q in response["queries"] if q.strip()] if not isinstance(response, list) or not response or len(response) == 0: From 58c806807950733eaf7f063ab73da83f149c5865 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 16 Aug 2024 07:58:04 -0500 Subject: [PATCH 3/3] Upgrade default offline chat model to llama 3.1 --- documentation/docs/features/chat.md | 2 +- documentation/docs/get-started/setup.mdx | 2 +- .../0058_alter_chatmodeloptions_chat_model.py | 17 +++++++++++++++++ src/khoj/database/models/__init__.py | 2 +- .../conversation/offline/chat_model.py | 6 +++--- .../processor/conversation/offline/utils.py | 2 +- src/khoj/processor/conversation/utils.py | 1 + src/khoj/utils/config.py | 2 +- src/khoj/utils/constants.py | 2 +- src/khoj/utils/rawconfig.py | 2 +- tests/conftest.py | 2 +- tests/helpers.py | 2 +- 12 files changed, 30 insertions(+), 12 deletions(-) create mode 100644 src/khoj/database/migrations/0058_alter_chatmodeloptions_chat_model.py diff --git a/documentation/docs/features/chat.md b/documentation/docs/features/chat.md index ed4fe9fe..5876dc76 100644 --- a/documentation/docs/features/chat.md +++ b/documentation/docs/features/chat.md @@ -25,7 +25,7 @@ Offline chat stays completely private and can work without internet using open-s > - An Nvidia, AMD GPU or a Mac M1+ machine would significantly speed up chat response times 1. Open your [Khoj offline settings](http://localhost:42110/server/admin/database/offlinechatprocessorconversationconfig/) and click *Enable* on the Offline Chat configuration. -2. Open your [Chat model options settings](http://localhost:42110/server/admin/database/chatmodeloptions/) and add any [GGUF chat model](https://huggingface.co/models?library=gguf) to use for offline chat. Make sure to use `Offline` as its type. For a balanced chat model that runs well on standard consumer hardware we recommend using [Hermes-2-Pro-Mistral-7B by NousResearch](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) by default. +2. Open your [Chat model options settings](http://localhost:42110/server/admin/database/chatmodeloptions/) and add any [GGUF chat model](https://huggingface.co/models?library=gguf) to use for offline chat. Make sure to use `Offline` as its type. For a balanced chat model that runs well on standard consumer hardware we recommend using [Llama 3.1 by Meta](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) by default. 
:::tip[Note] diff --git a/documentation/docs/get-started/setup.mdx b/documentation/docs/get-started/setup.mdx index bc954bde..61d2ef3d 100644 --- a/documentation/docs/get-started/setup.mdx +++ b/documentation/docs/get-started/setup.mdx @@ -222,7 +222,7 @@ Using Ollama? See the [Ollama Integration](/advanced/ollama) section for more cu Any chat model on Huggingface in GGUF format can be used for local chat. Here's how you can set it up: 1. No need to setup a conversation processor config! -2. Go over to configure your [chat model options](http://localhost:42110/server/admin/database/chatmodeloptions/). Set the `chat-model` field to a supported chat model[^1] of your choice. For example, we recommend `NousResearch/Hermes-2-Pro-Mistral-7B-GGUF`, but [any gguf model on huggingface](https://huggingface.co/models?library=gguf) should work. +2. Go over to configure your [chat model options](http://localhost:42110/server/admin/database/chatmodeloptions/). Set the `chat-model` field to a supported chat model[^1] of your choice. For example, we recommend `bartowski/Meta-Llama-3.1-8B-Instruct-GGUF`, but [any gguf model on huggingface](https://huggingface.co/models?library=gguf) should work. - Make sure to set the `model-type` to `Offline`. Do not set `openai config`. - The `tokenizer` and `max-prompt-size` fields are optional. You can set these for non-standard models (i.e not Mistral or Llama based models) or when you know the token limit of the model to improve context stuffing. diff --git a/src/khoj/database/migrations/0058_alter_chatmodeloptions_chat_model.py b/src/khoj/database/migrations/0058_alter_chatmodeloptions_chat_model.py new file mode 100644 index 00000000..ea4515e1 --- /dev/null +++ b/src/khoj/database/migrations/0058_alter_chatmodeloptions_chat_model.py @@ -0,0 +1,17 @@ +# Generated by Django 5.0.7 on 2024-08-19 12:37 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0057_remove_serverchatsettings_default_model_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="chatmodeloptions", + name="chat_model", + field=models.CharField(default="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", max_length=200), + ), + ] diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py index 72c93157..2468ffc9 100644 --- a/src/khoj/database/models/__init__.py +++ b/src/khoj/database/models/__init__.py @@ -91,7 +91,7 @@ class ChatModelOptions(BaseModel): max_prompt_size = models.IntegerField(default=None, null=True, blank=True) subscribed_max_prompt_size = models.IntegerField(default=None, null=True, blank=True) tokenizer = models.CharField(max_length=200, default=None, null=True, blank=True) - chat_model = models.CharField(max_length=200, default="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF") + chat_model = models.CharField(max_length=200, default="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF") model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OFFLINE) openai_config = models.ForeignKey( OpenAIProcessorConversationConfig, on_delete=models.CASCADE, default=None, null=True, blank=True diff --git a/src/khoj/processor/conversation/offline/chat_model.py b/src/khoj/processor/conversation/offline/chat_model.py index 1251dcec..c62d1e00 100644 --- a/src/khoj/processor/conversation/offline/chat_model.py +++ b/src/khoj/processor/conversation/offline/chat_model.py @@ -24,7 +24,7 @@ logger = logging.getLogger(__name__) def extract_questions_offline( text: str, - 
model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", + model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", loaded_model: Union[Any, None] = None, conversation_log={}, use_history: bool = True, @@ -141,7 +141,7 @@ def converse_offline( references=[], online_results=[], conversation_log={}, - model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", + model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", loaded_model: Union[Any, None] = None, completion_func=None, conversation_commands=[ConversationCommand.Default], @@ -240,7 +240,7 @@ def llm_thread(g, messages: List[ChatMessage], model: Any, max_prompt_size: int def send_message_to_model_offline( messages: List[ChatMessage], loaded_model=None, - model="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", + model="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", temperature: float = 0.2, streaming=False, stop=[], diff --git a/src/khoj/processor/conversation/offline/utils.py b/src/khoj/processor/conversation/offline/utils.py index 66017b36..88082ad1 100644 --- a/src/khoj/processor/conversation/offline/utils.py +++ b/src/khoj/processor/conversation/offline/utils.py @@ -75,6 +75,6 @@ def load_model_from_cache(repo_id: str, filename: str, repo_type="models"): def infer_max_tokens(model_context_window: int, configured_max_tokens=None) -> int: """Infer max prompt size based on device memory and max context window supported by the model""" configured_max_tokens = math.inf if configured_max_tokens is None else configured_max_tokens - vram_based_n_ctx = int(get_device_memory() / 2e6) # based on heuristic + vram_based_n_ctx = int(get_device_memory() / 1e6) # based on heuristic configured_max_tokens = configured_max_tokens or math.inf # do not use if set to None return min(configured_max_tokens, vram_based_n_ctx, model_context_window) diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index ea7368e6..251ac197 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -25,6 +25,7 @@ model_to_prompt_size = { "gpt-4-turbo-preview": 20000, "TheBloke/Mistral-7B-Instruct-v0.2-GGUF": 3500, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF": 3500, + "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF": 20000, } model_to_tokenizer: Dict[str, str] = {} diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index 0e88075f..03dad75c 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -70,7 +70,7 @@ class OfflineChatProcessorConfig: class OfflineChatProcessorModel: - def __init__(self, chat_model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", max_tokens: int = None): + def __init__(self, chat_model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", max_tokens: int = None): self.chat_model = chat_model self.loaded_model = None try: diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py index c3d8a186..9b7ffb77 100644 --- a/src/khoj/utils/constants.py +++ b/src/khoj/utils/constants.py @@ -8,7 +8,7 @@ empty_escape_sequences = "\n|\r|\t| " app_env_filepath = "~/.khoj/env" telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry" content_directory = "~/.khoj/content/" -default_offline_chat_model = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF" +default_offline_chat_model = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF" default_online_chat_model = "gpt-4-turbo-preview" empty_config = { diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index 617f37ea..6a788531 100644 --- a/src/khoj/utils/rawconfig.py +++ 
b/src/khoj/utils/rawconfig.py @@ -93,7 +93,7 @@ class OpenAIProcessorConfig(ConfigBase): class OfflineChatProcessorConfig(ConfigBase): - chat_model: Optional[str] = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF" + chat_model: Optional[str] = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF" class ConversationProcessorConfig(ConfigBase): diff --git a/tests/conftest.py b/tests/conftest.py index 61578ce2..0fe9d360 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -378,7 +378,7 @@ def client_offline_chat(search_config: SearchConfig, default_user2: KhojUser): # Initialize Processor from Config ChatModelOptionsFactory( - chat_model="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", + chat_model="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", tokenizer=None, max_prompt_size=None, model_type="offline", diff --git a/tests/helpers.py b/tests/helpers.py index 7894ffa2..2e8e5671 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -49,7 +49,7 @@ class ChatModelOptionsFactory(factory.django.DjangoModelFactory): max_prompt_size = 3500 tokenizer = None - chat_model = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF" + chat_model = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF" model_type = "offline" openai_config = factory.LazyAttribute( lambda obj: OpenAIProcessorConversationConfigFactory() if os.getenv("OPENAI_API_KEY") else None
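
The unwrapping added in PATCH 2/3 relies on simple slicing: the opening fence "```json" is 7 characters long, so response[7:-3] drops it together with the trailing "```". Below is a minimal, self-contained sketch of that idea as a standalone helper; the unwrap_json_codeblock and parse_model_json names, and the handling of a bare "```" fence, are illustrative assumptions rather than part of the patch.

import json
from typing import Any


def unwrap_json_codeblock(response: str) -> str:
    """Strip a Markdown code fence around a JSON payload, if present.

    Mirrors the patch's slicing: "```json" is 7 characters, so
    response[7:-3] removes the opening fence and the closing "```".
    """
    response = response.strip()
    if response.startswith("```json") and response.endswith("```"):
        return response[7:-3].strip()
    if response.startswith("```") and response.endswith("```"):  # bare fence, assumed variant
        return response[3:-3].strip()
    return response


def parse_model_json(response: str) -> Any:
    """Unwrap any code fence, then parse the JSON payload."""
    return json.loads(unwrap_json_codeblock(response))


# Example: gemma-2 style output wrapped in a fenced code block
raw = '```json\n{"queries": ["khoj offline chat setup"]}\n```'
print(parse_model_json(raw)["queries"])  # ['khoj offline chat setup']

Responses that arrive as bare JSON pass through unchanged, which is why other models remain unaffected by the change.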
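
PATCH 3/3 also halves the divisor in infer_max_tokens, doubling the VRAM-based context estimate to roughly one token of context per MB of device memory. A small worked example of that heuristic follows; get_device_memory is replaced here by an explicit byte count for self-containment, and the 8 GB figure is illustrative, not from the patch.

import math


def infer_max_tokens(model_context_window: int, device_memory_bytes: int, configured_max_tokens=None) -> int:
    """Sketch of the patched heuristic: ~1 token of context per MB of device memory."""
    configured_max_tokens = math.inf if configured_max_tokens is None else configured_max_tokens
    vram_based_n_ctx = int(device_memory_bytes / 1e6)  # was device_memory / 2e6 before the patch
    return min(configured_max_tokens, vram_based_n_ctx, model_context_window)


# With 8 GB of device memory and a 128K-token context window (e.g. Llama 3.1):
print(infer_max_tokens(128_000, 8 * 1024**3))  # 8589 tokens, vs ~4294 with the old /2e6 divisor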