From 13b16a4364abf6114056a96f0a52c8e63736e738 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 3 Oct 2023 16:29:46 -0700 Subject: [PATCH 01/62] Use default Llama 2 supported by GPT4All Remove custom logic to download custom Llama 2 model. This was added as GPT4All didn't support Llama 2 when it was added to Khoj --- .../conversation/gpt4all/chat_model.py | 4 +- .../conversation/gpt4all/model_metadata.py | 3 - .../processor/conversation/gpt4all/utils.py | 71 +------------------ src/khoj/processor/conversation/utils.py | 4 +- src/khoj/utils/config.py | 2 +- tests/test_gpt4all_chat_actors.py | 2 +- 6 files changed, 7 insertions(+), 79 deletions(-) delete mode 100644 src/khoj/processor/conversation/gpt4all/model_metadata.py diff --git a/src/khoj/processor/conversation/gpt4all/chat_model.py b/src/khoj/processor/conversation/gpt4all/chat_model.py index 9bc9ea52..d713831a 100644 --- a/src/khoj/processor/conversation/gpt4all/chat_model.py +++ b/src/khoj/processor/conversation/gpt4all/chat_model.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) def extract_questions_offline( text: str, - model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin", + model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin", loaded_model: Union[Any, None] = None, conversation_log={}, use_history: bool = True, @@ -123,7 +123,7 @@ def converse_offline( references, user_query, conversation_log={}, - model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin", + model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin", loaded_model: Union[Any, None] = None, completion_func=None, conversation_command=ConversationCommand.Default, diff --git a/src/khoj/processor/conversation/gpt4all/model_metadata.py b/src/khoj/processor/conversation/gpt4all/model_metadata.py deleted file mode 100644 index 065e3720..00000000 --- a/src/khoj/processor/conversation/gpt4all/model_metadata.py +++ /dev/null @@ -1,3 +0,0 @@ -model_name_to_url = { - "llama-2-7b-chat.ggmlv3.q4_K_S.bin": "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_K_S.bin" -} diff --git a/src/khoj/processor/conversation/gpt4all/utils.py b/src/khoj/processor/conversation/gpt4all/utils.py index 4042fbe2..585df6a6 100644 --- a/src/khoj/processor/conversation/gpt4all/utils.py +++ b/src/khoj/processor/conversation/gpt4all/utils.py @@ -1,24 +1,8 @@ -import os import logging -import requests -import hashlib -from tqdm import tqdm - -from khoj.processor.conversation.gpt4all import model_metadata logger = logging.getLogger(__name__) -expected_checksum = {"llama-2-7b-chat.ggmlv3.q4_K_S.bin": "cfa87b15d92fb15a2d7c354b0098578b"} - - -def get_md5_checksum(filename: str): - hash_md5 = hashlib.md5() - with open(filename, "rb") as f: - for chunk in iter(lambda: f.read(8192), b""): - hash_md5.update(chunk) - return hash_md5.hexdigest() - def download_model(model_name: str): try: @@ -27,57 +11,4 @@ def download_model(model_name: str): logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.") raise e - url = model_metadata.model_name_to_url.get(model_name) - model_path = os.path.expanduser(f"~/.cache/gpt4all/") - if not url: - logger.debug(f"Model {model_name} not found in model metadata. Skipping download.") - return GPT4All(model_name=model_name, model_path=model_path) - - filename = os.path.expanduser(f"~/.cache/gpt4all/{model_name}") - if os.path.exists(filename): - # Check if the user is connected to the internet - try: - requests.get("https://www.google.com/", timeout=5) - except: - logger.debug("User is offline. 
Disabling allowed download flag") - return GPT4All(model_name=model_name, model_path=model_path, allow_download=False) - return GPT4All(model_name=model_name, model_path=model_path) - - # Download the model to a tmp file. Once the download is completed, move the tmp file to the actual file - tmp_filename = filename + ".tmp" - - try: - os.makedirs(os.path.dirname(tmp_filename), exist_ok=True) - logger.debug(f"Downloading model {model_name} from {url} to {filename}...") - with requests.get(url, stream=True) as r: - r.raise_for_status() - total_size = int(r.headers.get("content-length", 0)) - with open(tmp_filename, "wb") as f, tqdm( - unit="B", # unit string to be displayed. - unit_scale=True, # let tqdm to determine the scale in kilo, mega..etc. - unit_divisor=1024, # is used when unit_scale is true - total=total_size, # the total iteration. - desc=model_name, # prefix to be displayed on progress bar. - ) as progress_bar: - for chunk in r.iter_content(chunk_size=8192): - f.write(chunk) - progress_bar.update(len(chunk)) - - # Verify the checksum - if expected_checksum.get(model_name) != get_md5_checksum(tmp_filename): - logger.error( - f"Checksum verification failed for {filename}. Removing the tmp file. Offline model will not be available." - ) - os.remove(tmp_filename) - raise ValueError(f"Checksum verification failed for downloading {model_name} from {url}.") - - # Move the tmp file to the actual file - os.rename(tmp_filename, filename) - logger.debug(f"Successfully downloaded model {model_name} from {url} to {filename}") - return GPT4All(model_name) - except Exception as e: - logger.error(f"Failed to download model {model_name} from {url} to {filename}. Error: {e}", exc_info=True) - # Remove the tmp file if it exists - if os.path.exists(tmp_filename): - os.remove(tmp_filename) - return None + return GPT4All(model_name=model_name) diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index 4a92c367..ece526c2 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -17,10 +17,10 @@ logger = logging.getLogger(__name__) max_prompt_size = { "gpt-3.5-turbo": 4096, "gpt-4": 8192, - "llama-2-7b-chat.ggmlv3.q4_K_S.bin": 1548, + "llama-2-7b-chat.ggmlv3.q4_0.bin": 1548, "gpt-3.5-turbo-16k": 15000, } -tokenizer = {"llama-2-7b-chat.ggmlv3.q4_K_S.bin": "hf-internal-testing/llama-tokenizer"} +tokenizer = {"llama-2-7b-chat.ggmlv3.q4_0.bin": "hf-internal-testing/llama-tokenizer"} class ThreadedGenerator: diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index a6532346..f06d4c69 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -84,7 +84,7 @@ class SearchModels: @dataclass class GPT4AllProcessorConfig: - chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_K_S.bin" + chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin" loaded_model: Union[Any, None] = None diff --git a/tests/test_gpt4all_chat_actors.py b/tests/test_gpt4all_chat_actors.py index d7904ff8..32ee4020 100644 --- a/tests/test_gpt4all_chat_actors.py +++ b/tests/test_gpt4all_chat_actors.py @@ -24,7 +24,7 @@ from khoj.processor.conversation.gpt4all.utils import download_model from khoj.processor.conversation.utils import message_to_log -MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_K_S.bin" +MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_0.bin" @pytest.fixture(scope="session") From d1ff812021a4c59a5d67495207ad90a0fe0be44d Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 4 Oct 2023 18:42:12 -0700 Subject: 
[PATCH 02/62] Run GPT4All Chat Model on GPU, when available GPT4All now supports running models on GPU via Vulkan --- src/khoj/processor/conversation/gpt4all/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/khoj/processor/conversation/gpt4all/utils.py b/src/khoj/processor/conversation/gpt4all/utils.py index 585df6a6..d5201780 100644 --- a/src/khoj/processor/conversation/gpt4all/utils.py +++ b/src/khoj/processor/conversation/gpt4all/utils.py @@ -11,4 +11,12 @@ def download_model(model_name: str): logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.") raise e - return GPT4All(model_name=model_name) + # Use GPU for Chat Model, if available + try: + model = GPT4All(model_name=model_name, device="gpu") + logger.debug("Loaded chat model to GPU.") + except ValueError: + model = GPT4All(model_name=model_name) + logger.debug("Loaded chat model to CPU.") + + return model From a85ff941ca49538ac6090e4d891e72710737744f Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 4 Oct 2023 20:39:31 -0700 Subject: [PATCH 03/62] Make offline chat model user configurable Only GPT4All supported Llama v2 models will work given the prompt structure is not currently configurable --- src/khoj/utils/config.py | 3 ++- src/khoj/utils/rawconfig.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index f06d4c69..5accd2ad 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -84,7 +84,7 @@ class SearchModels: @dataclass class GPT4AllProcessorConfig: - chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin" + chat_model: Optional[str] = None loaded_model: Union[Any, None] = None @@ -95,6 +95,7 @@ class ConversationProcessorConfigModel: ): self.openai_model = conversation_config.openai self.gpt4all_model = GPT4AllProcessorConfig() + self.gpt4all_model.chat_model = conversation_config.offline_chat_model self.enable_offline_chat = conversation_config.enable_offline_chat self.conversation_logfile = Path(conversation_config.conversation_logfile) self.chat_session: List[str] = [] diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index 0a916db4..30a98354 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -95,6 +95,7 @@ class ConversationProcessorConfig(ConfigBase): conversation_logfile: Path openai: Optional[OpenAIProcessorConfig] enable_offline_chat: Optional[bool] = False + offline_chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin" class ProcessorConfig(ConfigBase): From 052b25af0a4da5a97aab21d8be05a89fc60cfaed Mon Sep 17 00:00:00 2001 From: sabaimran Date: Fri, 6 Oct 2023 12:29:15 -0700 Subject: [PATCH 04/62] Update default configuration passed to Khoj clients to circumvent valiation issues --- src/khoj/utils/constants.py | 53 +++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py index c5a67714..8da50d76 100644 --- a/src/khoj/utils/constants.py +++ b/src/khoj/utils/constants.py @@ -6,6 +6,59 @@ empty_escape_sequences = "\n|\r|\t| " app_env_filepath = "~/.khoj/env" telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry" +empty_config = { + "content-type": { + "org": { + "input-files": None, + "input-filter": None, + "compressed-jsonl": "~/.khoj/content/org/org.jsonl.gz", + "embeddings-file": "~/.khoj/content/org/org_embeddings.pt", + "index-heading-entries": False, + }, + 
"markdown": { + "input-files": None, + "input-filter": None, + "compressed-jsonl": "~/.khoj/content/markdown/markdown.jsonl.gz", + "embeddings-file": "~/.khoj/content/markdown/markdown_embeddings.pt", + }, + "pdf": { + "input-files": None, + "input-filter": None, + "compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz", + "embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt", + }, + "plaintext": { + "input-files": None, + "input-filter": None, + "compressed-jsonl": "~/.khoj/content/plaintext/plaintext.jsonl.gz", + "embeddings-file": "~/.khoj/content/plaintext/plaintext_embeddings.pt", + }, + }, + "search-type": { + "symmetric": { + "encoder": "sentence-transformers/all-MiniLM-L6-v2", + "cross-encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "model_directory": "~/.khoj/search/symmetric/", + }, + "asymmetric": { + "encoder": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1", + "cross-encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "model_directory": "~/.khoj/search/asymmetric/", + }, + "image": {"encoder": "sentence-transformers/clip-ViT-B-32", "model_directory": "~/.khoj/search/image/"}, + }, + "processor": { + "conversation": { + "openai": { + "api-key": None, + "chat-model": "gpt-3.5-turbo", + }, + "enable-offline-chat": False, + "conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json", + } + }, +} + # default app config to use default_config = { "content-type": { From 5c4f0d42b7961d5db7338bad9dd520659207e535 Mon Sep 17 00:00:00 2001 From: sabaimran Date: Fri, 6 Oct 2023 12:30:09 -0700 Subject: [PATCH 05/62] Return new default config in API endpoint --- src/khoj/routers/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 2ff6bab0..db88324a 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -322,7 +322,7 @@ if not state.demo: # Create Routes @api.get("/config/data/default") def get_default_config_data(): - return constants.default_config + return constants.empty_config @api.get("/config/types", response_model=List[str]) From f6f7a62d8076580e8794b18cee20ba86dd95a0e6 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 6 Oct 2023 12:39:19 -0700 Subject: [PATCH 06/62] Wait for user to stop typing to trigger search from khoj.el in Emacs - Improves user experience by aligning idle time with search latency to avoid display jitter (to render results) while user is typing - Makes the idle time configurable Closes #480 --- src/interface/emacs/khoj.el | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 2f8360f2..e690b480 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -87,6 +87,12 @@ :group 'khoj :type 'integer) +(defcustom khoj-search-on-idle-time 0.3 + "Idle time (in seconds) to wait before triggering search." + :group 'khoj + :type 'number) + + (defcustom khoj-default-content-type "org" "The default content type to perform search on." :group 'khoj @@ -115,6 +121,9 @@ (defvar khoj--content-type "org" "The type of content to perform search on.") +(defvar khoj--search-on-idle-timer nil + "Idle timer to trigger incremental search.") + (declare-function org-element-property "org-mode" (PROPERTY ELEMENT)) (declare-function org-element-type "org-mode" (ELEMENT)) (declare-function markdown-mode "markdown-mode" ()) @@ -920,6 +929,9 @@ RECEIVE-DATE is the message receive date." 
(message "khoj.el: Teardown Incremental Search") ;; unset khoj minibuffer window (setq khoj--minibuffer-window nil) + (when (and khoj--search-on-idle-timer + (timerp khoj--search-on-idle-timer)) + (cancel-timer khoj--search-on-idle-timer)) ;; delete open connections to khoj server (khoj--delete-open-network-connections-to-server) ;; remove hooks for khoj incremental query and self @@ -942,8 +954,10 @@ RECEIVE-DATE is the message receive date." ;; set current (mini-)buffer entered as khoj minibuffer ;; used to query khoj API only when user in khoj minibuffer (setq khoj--minibuffer-window (current-buffer)) - (add-hook 'post-command-hook #'khoj--incremental-search) ; do khoj incremental search after every user action - (add-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search)) ; teardown khoj incremental search on minibuffer exit + ; do khoj incremental search after idle time + (setq khoj--search-on-idle-timer (run-with-idle-timer khoj-search-on-idle-time t #'khoj--incremental-search)) + ; teardown khoj incremental search on minibuffer exit + (add-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search)) (read-string khoj--query-prompt)))) From 148e8f468f44880747a5aa59a6ac374217bd43dd Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 9 Oct 2023 19:30:58 -0700 Subject: [PATCH 07/62] Restrict openai package version below 1.0.0 to avoid breaking changes --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a52fc9b6..f352a83d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ dependencies = [ "defusedxml == 0.7.1", "fastapi == 0.77.1", "jinja2 == 3.1.2", - "openai >= 0.27.0", + "openai >= 0.27.0, < 1.0.0", "tiktoken >= 0.3.2", "tenacity >= 8.2.2", "pillow == 9.3.0", From 6aa69da3ef74340e205f3392b8e73327deff0b45 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 9 Oct 2023 21:35:58 -0700 Subject: [PATCH 08/62] Put indexer API endpoint under /api path segment Update FastAPI app router, desktop app and to use new url path to batch indexer API endpoint All api endpoints should exist under /api path segment --- src/interface/desktop/main.js | 2 +- src/khoj/configure.py | 2 +- tests/test_client.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/interface/desktop/main.js b/src/interface/desktop/main.js index 4f8891cf..83a19f36 100644 --- a/src/interface/desktop/main.js +++ b/src/interface/desktop/main.js @@ -169,7 +169,7 @@ function pushDataToKhoj (regenerate = false) { const hostURL = store.get('hostURL') || KHOJ_URL; - axios.post(`${hostURL}/v1/indexer/batch?regenerate=${regenerate}`, stream, { headers }) + axios.post(`${hostURL}/api/v1/indexer/batch?regenerate=${regenerate}`, stream, { headers }) .then(response => { console.log(response.data); const win = BrowserWindow.getAllWindows()[0]; diff --git a/src/khoj/configure.py b/src/khoj/configure.py index 7e6cc409..c978735e 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -103,7 +103,7 @@ def configure_routes(app): app.mount("/static", StaticFiles(directory=constants.web_directory), name="static") app.include_router(api, prefix="/api") app.include_router(api_beta, prefix="/api/beta") - app.include_router(indexer, prefix="/v1/indexer") + app.include_router(indexer, prefix="/api/v1/indexer") app.include_router(web_client) diff --git a/tests/test_client.py b/tests/test_client.py index d2497f73..40a032f7 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -66,7 +66,7 @@ 
def test_index_batch(client): headers = {"x-api-key": "secret"} # Act - response = client.post("/v1/indexer/batch", json=request_body, headers=headers) + response = client.post("/api/v1/indexer/batch", json=request_body, headers=headers) # Assert assert response.status_code == 200 @@ -81,7 +81,7 @@ def test_regenerate_with_valid_content_type(client): headers = {"x-api-key": "secret"} # Act - response = client.post(f"/v1/indexer/batch?search_type={content_type}", json=request_body, headers=headers) + response = client.post(f"/api/v1/indexer/batch?search_type={content_type}", json=request_body, headers=headers) # Assert assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}" @@ -97,7 +97,7 @@ def test_regenerate_with_github_fails_without_pat(client): headers = {"x-api-key": "secret"} # Act - response = client.post(f"/v1/indexer/batch?search_type=github", json=request_body, headers=headers) + response = client.post(f"/api/v1/indexer/batch?search_type=github", json=request_body, headers=headers) # Assert assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github" From 9ba173bc2dc6ceb9434aac8d011a6e9e3fdf563c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 11 Oct 2023 17:12:03 -0700 Subject: [PATCH 09/62] Improve emoji, message on content index updated via logger Use mailbox closed with flag down once content index completed. Use standard, existing logger messages in new indexer messages, when files to index sent by clients --- src/khoj/configure.py | 2 +- src/khoj/routers/api.py | 2 +- src/khoj/routers/indexer.py | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/khoj/configure.py b/src/khoj/configure.py index c978735e..7b2b3ce2 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -117,7 +117,7 @@ if not state.demo: state.content_index = configure_content( state.content_index, state.config.content_type, all_files, state.search_models ) - logger.info("📬 Content index updated via Scheduler") + logger.info("📪 Content index updated via Scheduler") except Exception as e: logger.error(f"🚨 Error updating content index via Scheduler: {e}", exc_info=True) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index db88324a..5dd60a51 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -622,7 +622,7 @@ def update( if state.processor_config: components.append("Conversation processor") components_msg = ", ".join(components) - logger.info(f"📬 {components_msg} updated via API") + logger.info(f"📪 {components_msg} updated via API") update_telemetry_state( request=request, diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py index f5b2b418..94fc392d 100644 --- a/src/khoj/routers/indexer.py +++ b/src/khoj/routers/indexer.py @@ -85,6 +85,7 @@ async def index_batch( index_batch_request = IndexBatchRequest.parse_raw(index_batch_request_acc) logger.info(f"Received {len(index_batch_request.files)} files") + logger.info("📬 Updating content index via API") org_files: Dict[str, str] = {} markdown_files: Dict[str, str] = {} pdf_files: Dict[str, str] = {} @@ -115,7 +116,7 @@ async def index_batch( ) if state.config == None: - logger.info("First run, initializing state.") + logger.info("📬 Initializing content index on first run.") default_full_config = FullConfig( content_type=None, search_type=SearchConfig.parse_obj(constants.default_config["search-type"]), @@ -148,9 +149,10 @@ async def index_batch( ) except Exception as e: 
- logger.error(f"Failed to process batch indexing request: {e}", exc_info=True) + logger.error(f"🚨 Failed to update content index via API: {e}", exc_info=True) finally: state.config_lock.release() + logger.info("📪 Content index updated via API") return Response(content="OK", status_code=200) From 60e9a616470dd8e6e0c043e50d3185eb278a8681 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 11 Oct 2023 17:14:15 -0700 Subject: [PATCH 10/62] Use multi-part form to receive files to index on server - This uses existing HTTP affordance to process files - Better handling of binary file formats as removes need to url encode/decode - Less memory utilization than streaming json as files get automatically written to disk once memory utilization exceeds preset limits - No manual parsing of raw files streams required --- pyproject.toml | 1 + src/khoj/routers/indexer.py | 31 ++++++------------------------- src/khoj/utils/helpers.py | 24 ++++++++++++++---------- 3 files changed, 21 insertions(+), 35 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f352a83d..afd78848 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ dependencies = [ "dateparser >= 1.1.1", "defusedxml == 0.7.1", "fastapi == 0.77.1", + "python-multipart >= 0.0.5", "jinja2 == 3.1.2", "openai >= 0.27.0, < 1.0.0", "tiktoken >= 0.3.2", diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py index 94fc392d..86cd847f 100644 --- a/src/khoj/routers/indexer.py +++ b/src/khoj/routers/indexer.py @@ -1,10 +1,9 @@ # Standard Packages import logging -import sys from typing import Optional, Union, Dict # External Packages -from fastapi import APIRouter, HTTPException, Header, Request, Body, Response +from fastapi import APIRouter, HTTPException, Header, Response, UploadFile from pydantic import BaseModel # Internal Packages @@ -58,7 +57,7 @@ class IndexerInput(BaseModel): @indexer.post("/batch") async def index_batch( - request: Request, + files: list[UploadFile], x_api_key: str = Header(None), regenerate: bool = False, search_type: Optional[Union[state.SearchType, str]] = None, @@ -67,32 +66,14 @@ async def index_batch( raise HTTPException(status_code=401, detail="Invalid API Key") state.config_lock.acquire() try: - logger.info(f"Received batch indexing request") - index_batch_request_acc = b"" - async for chunk in request.stream(): - index_batch_request_acc += chunk - data_bytes = sys.getsizeof(index_batch_request_acc) - unit = "KB" - data_size = data_bytes / 1024 - if data_size > 1000: - unit = "MB" - data_size = data_size / 1024 - if data_size > 1000: - unit = "GB" - data_size = data_size / 1024 - data_size_metric = f"{data_size:.2f} {unit}" - logger.info(f"Received {data_size_metric} of data") - index_batch_request = IndexBatchRequest.parse_raw(index_batch_request_acc) - logger.info(f"Received {len(index_batch_request.files)} files") - logger.info("📬 Updating content index via API") org_files: Dict[str, str] = {} markdown_files: Dict[str, str] = {} pdf_files: Dict[str, str] = {} plaintext_files: Dict[str, str] = {} - for file in index_batch_request.files: - file_type = get_file_type(file.path) + for file in files: + file_type = get_file_type(file.content_type) dict_to_update = None if file_type == "org": dict_to_update = org_files @@ -104,9 +85,9 @@ async def index_batch( dict_to_update = plaintext_files if dict_to_update is not None: - dict_to_update[file.path] = file.content + dict_to_update[file.filename] = file.file.read().decode("utf-8") else: - logger.info(f"Skipping unsupported streamed 
file: {file.path}") + logger.warning(f"Skipped indexing unsupported file type sent by client: {file.filename}") indexer_input = IndexerInput( org=org_files, diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index f8977043..3391a55d 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -66,20 +66,24 @@ def merge_dicts(priority_dict: dict, default_dict: dict): return merged_dict -def get_file_type(filepath: str) -> str: - "Get file type from file path" - file_type = Path(filepath).suffix[1:] +def get_file_type(file_type: str) -> str: + "Get file type from file mime type" - if file_type in ["md", "markdown"]: + file_type = file_type.split(";")[0].strip() if ";" in file_type else file_type + if file_type in ["text/markdown"]: return "markdown" - elif file_type in ["org", "orgmode"]: + elif file_type in ["text/org"]: return "org" - elif file_type in ["txt", "text", "html", "xml", "htm", "rst"]: - return "plaintext" - elif file_type in ["pdf"]: + elif file_type in ["application/pdf"]: return "pdf" - - return file_type + elif file_type in ["image/jpeg"]: + return "jpeg" + elif file_type in ["image/png"]: + return "png" + elif file_type in ["text/plain", "text/html", "application/xml", "text/x-rst"]: + return "plaintext" + else: + return "other" def load_model( From 72f8fde7efd335664155b8db4360335882c45f90 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 12 Oct 2023 16:19:48 -0700 Subject: [PATCH 11/62] Run pytests in parallel on multiple CPU cores using pytest-xdist for speed --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f352a83d..cdf8f284 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,7 @@ test = [ "freezegun >= 1.2.0", "factory-boy >= 3.2.1", "trio >= 0.22.0", + "pytest-xdist", ] dev = [ "khoj-assistant[test]", @@ -111,7 +112,7 @@ warn_unused_ignores = false line-length = 120 [tool.pytest.ini_options] -addopts = "--strict-markers" +addopts = "--strict-markers -n 4" markers = [ "chatquality: Evaluate chatbot capabilities and quality", ] From 7190b3811d82ca3179622ce9f3265bc608102513 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 12 Oct 2023 20:45:22 -0700 Subject: [PATCH 12/62] Remove all filter terms in user query from defiltered_query Previously only the the last filter's terms were getting effectively applied as the `filter.defilter' operation was being done on `user_query' but was updating the `defiltered_query' --- src/khoj/routers/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index db88324a..ff2d88a2 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -387,7 +387,7 @@ async def search( # Encode query with filter terms removed defiltered_query = user_query for filter in [DateFilter(), WordFilter(), FileFilter()]: - defiltered_query = filter.defilter(user_query) + defiltered_query = filter.defilter(defiltered_query) encoded_asymmetric_query = None if t == SearchType.All or t != SearchType.Image: From 68018ef3971c99c7cd64ada5b92cd0af7924d71e Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 11 Oct 2023 18:12:12 -0700 Subject: [PATCH 13/62] Use multi-part form to send files to index on desktop client - Add typing for variables in for loop and other minor formatting clean-up - Assume utf8 encoding for text files and binary for image, pdf files --- src/interface/desktop/main.js | 137 ++++++++++++++++------------------ 1 file 
changed, 66 insertions(+), 71 deletions(-) diff --git a/src/interface/desktop/main.js b/src/interface/desktop/main.js index 83a19f36..62493f54 100644 --- a/src/interface/desktop/main.js +++ b/src/interface/desktop/main.js @@ -8,7 +8,6 @@ const {dialog} = require('electron'); const cron = require('cron').CronJob; const axios = require('axios'); -const { Readable } = require('stream'); const KHOJ_URL = 'http://127.0.0.1:42110' @@ -65,7 +64,7 @@ const schema = { var state = {} -const store = new Store({schema}); +const store = new Store({ schema }); console.log(store); @@ -86,37 +85,48 @@ function handleSetTitle (event, title) { }); } +function filenameToMimeType (filename) { + const extension = filename.split('.').pop(); + switch (extension) { + case 'pdf': + return 'application/pdf'; + case 'png': + return 'image/png'; + case 'jpg': + return 'image/jpeg'; + case 'jpeg': + return 'image/jpeg'; + case 'markdown': + return 'text/markdown'; + case 'org': + return 'text/org'; + default: + return 'text/plain'; + } +} + function pushDataToKhoj (regenerate = false) { let filesToPush = []; - const files = store.get('files'); - const folders = store.get('folders'); - state = { - completed: true + const files = store.get('files') || []; + const folders = store.get('folders') || []; + state = { completed: true } + + for (const file of files) { + filesToPush.push(file.path); } - if (files) { - for (file of files) { - filesToPush.push(file.path); - } - } - if (folders) { - for (folder of folders) { - const files = fs.readdirSync(folder.path, { withFileTypes: true }); - for (file of files) { - if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) { - filesToPush.push(path.join(folder.path, file.name)); - } + for (const folder of folders) { + const files = fs.readdirSync(folder.path, { withFileTypes: true }); + for (const file of files) { + if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) { + filesToPush.push(path.join(folder.path, file.name)); } } } - let data = { - files: [] - } - const lastSync = store.get('lastSync') || []; - - for (file of filesToPush) { + const formData = new FormData(); + for (const file of filesToPush) { const stats = fs.statSync(file); if (!regenerate) { if (stats.mtime.toISOString() < lastSync.find((syncedFile) => syncedFile.path === file)?.datetime) { @@ -125,18 +135,10 @@ function pushDataToKhoj (regenerate = false) { } try { - let rawData; - // If the file is a PDF or IMG file, read it as a binary file - if (binaryFileTypes.includes(file.split('.').pop())) { - rawData = fs.readFileSync(file).toString('base64'); - } else { - rawData = fs.readFileSync(file, 'utf8'); - } - - data.files.push({ - path: file, - content: rawData - }); + encoding = binaryFileTypes.includes(file.split('.').pop()) ? "binary" : "utf8"; + mimeType = filenameToMimeType(file) + (encoding === "utf8" ? 
"; charset=UTF-8" : ""); + fileObj = new Blob([fs.createReadStream(file, encoding)], { type: mimeType }); + formData.append('files', fileObj, file); state[file] = { success: true, } @@ -151,44 +153,37 @@ function pushDataToKhoj (regenerate = false) { for (const syncedFile of lastSync) { if (!filesToPush.includes(syncedFile.path)) { - data.files.push({ - path: syncedFile.path, - content: "" - }); + fileObj = new Blob([""], { type: filenameToMimeType(syncedFile.path) }); + formData.append('files', fileObj, syncedFile.path); } } - const headers = { 'x-api-key': 'secret', 'Content-Type': 'application/json' }; - - const stream = new Readable({ - read() { - this.push(JSON.stringify(data)); - this.push(null); - } - }); - - const hostURL = store.get('hostURL') || KHOJ_URL; - - axios.post(`${hostURL}/api/v1/indexer/batch?regenerate=${regenerate}`, stream, { headers }) - .then(response => { - console.log(response.data); - const win = BrowserWindow.getAllWindows()[0]; - win.webContents.send('update-state', state); - let lastSync = []; - for (const file of filesToPush) { - lastSync.push({ - path: file, - datetime: new Date().toISOString() - }); - } - store.set('lastSync', lastSync); - }) - .catch(error => { - console.error(error); - state['completed'] = false - const win = BrowserWindow.getAllWindows()[0]; - win.webContents.send('update-state', state); - }); + if (!!formData?.entries()?.next().value) { + const hostURL = store.get('hostURL') || KHOJ_URL; + const headers = { + 'x-api-key': 'secret' + }; + axios.post(`${hostURL}/api/v1/indexer/batch?regenerate=${regenerate}`, formData, { headers }) + .then(response => { + console.log(response.data); + const win = BrowserWindow.getAllWindows()[0]; + win.webContents.send('update-state', state); + let lastSync = []; + for (const file of filesToPush) { + lastSync.push({ + path: file, + datetime: new Date().toISOString() + }); + } + store.set('lastSync', lastSync); + }) + .catch(error => { + console.error(error); + state['completed'] = false + const win = BrowserWindow.getAllWindows()[0]; + win.webContents.send('update-state', state); + }); + } } pushDataToKhoj(); From fc9943175473701f2a32f87f841d827d9f62c276 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 11 Oct 2023 22:45:29 -0700 Subject: [PATCH 14/62] Send files to index on server from the khoj.el emacs client - Add elisp variable to set API key to engage with the Khoj server - Use multi-part form to POST the files to index to the indexer API endpoint on the khoj server --- src/interface/emacs/khoj.el | 46 +++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index e690b480..3d103c0b 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -92,6 +92,10 @@ :group 'khoj :type 'number) +(defcustom khoj-server-api-key "secret" + "API Key to Khoj server." + :group 'khoj + :type 'string) (defcustom khoj-default-content-type "org" "The default content type to perform search on." @@ -374,7 +378,7 @@ CONFIG is json obtained from Khoj config API." (string-join "/")))) (defun khoj--server-configure () - "Configure the the Khoj server for search and chat." + "Configure the Khoj server for search and chat." (interactive) (let* ((org-directory-regexes (or (mapcar (lambda (dir) (format "%s/**/*.org" dir)) khoj-org-directories) json-null)) (current-config @@ -388,7 +392,6 @@ CONFIG is json obtained from Khoj config API." 
(default-index-dir (khoj--get-directory-from-config default-config '(content-type org embeddings-file))) (default-chat-dir (khoj--get-directory-from-config default-config '(processor conversation conversation-logfile))) (chat-model (or khoj-chat-model (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor default-config)))))) - (default-model (alist-get 'model (alist-get 'conversation (alist-get 'processor default-config)))) (enable-offline-chat (or khoj-chat-offline (alist-get 'enable-offline-chat (alist-get 'conversation (alist-get 'processor default-config))))) (config (or current-config default-config))) @@ -517,6 +520,45 @@ CONFIG is json obtained from Khoj config API." ;; Configure server once it's ready (khoj--server-configure)))) + +;; ------------------- +;; Khoj Index Content +;; ------------------- + +(defun khoj--server-index-files (&optional file-paths) + "Send files to the Khoj server to index for search and chat." + (interactive) + (let ((boundary (format "-------------------------%d" (random (expt 10 10)))) + (files-to-index (or file-paths + (append (mapcan (lambda (dir) (directory-files-recursively dir "\\.org$")) khoj-org-directories) khoj-org-files)))) + + (let* ((url-request-method "POST") + (url-request-extra-headers `(("content-type" . ,(format "multipart/form-data; boundary=%s" boundary)) + ("x-api-key" . ,khoj-server-api-key))) + ;; add files to index as form data + (url-request-data (with-temp-buffer + (set-buffer-multibyte t) + (insert "\n") + (dolist (file-to-index files-to-index) + (insert (format "--%s\r\n" boundary)) + (insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-to-index)) + (insert "Content-Type: text/org\r\n\r\n") + (insert (with-temp-buffer + (insert-file-contents-literally file-to-index) + (buffer-string))) + (insert "\r\n")) + (insert (format "--%s--\r\n" boundary)) + (buffer-string)))) + (with-current-buffer + (url-retrieve (format "%s/api/v1/indexer/batch" khoj-server-url) + ;; render response from indexing API endpoint on server + (lambda (status) + (with-current-buffer (current-buffer) + (goto-char url-http-end-of-headers) + (message "khoj.el: status: %s. 
response: %s" status (string-trim (buffer-substring-no-properties (point) (point-max)))))) + nil t t))))) + + ;; ----------------------------------------------- ;; Extract and Render Entries of each Content Type From bed3aff059b6de6ff8c6181d61928d1051368cf6 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 12 Oct 2023 16:16:51 -0700 Subject: [PATCH 15/62] Update tests to test multi-part/form method of pushing files to index Instead of using the previous method to push data as json payload of POST request pass it as files to upload via the multi-part/form to the batch indexer API endpoint --- tests/test_client.py | 50 +++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/tests/test_client.py b/tests/test_client.py index 40a032f7..831668f7 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -62,11 +62,11 @@ def test_regenerate_with_invalid_content_type(client): # ---------------------------------------------------------------------------------------------------- def test_index_batch(client): # Arrange - request_body = get_sample_files_data() + files = get_sample_files_data() headers = {"x-api-key": "secret"} # Act - response = client.post("/api/v1/indexer/batch", json=request_body, headers=headers) + response = client.post("/api/v1/indexer/batch", files=files, headers=headers) # Assert assert response.status_code == 200 @@ -76,12 +76,11 @@ def test_index_batch(client): def test_regenerate_with_valid_content_type(client): for content_type in ["all", "org", "markdown", "image", "pdf", "notion", "plugin1"]: # Arrange - request_body = get_sample_files_data() - + files = get_sample_files_data() headers = {"x-api-key": "secret"} # Act - response = client.post(f"/api/v1/indexer/batch?search_type={content_type}", json=request_body, headers=headers) + response = client.post(f"/api/v1/indexer/batch?search_type={content_type}", files=files, headers=headers) # Assert assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}" @@ -92,12 +91,11 @@ def test_regenerate_with_github_fails_without_pat(client): response = client.get(f"/api/update?force=true&t=github") # Arrange - request_body = get_sample_files_data() - + files = get_sample_files_data() headers = {"x-api-key": "secret"} # Act - response = client.post(f"/api/v1/indexer/batch?search_type=github", json=request_body, headers=headers) + response = client.post(f"/api/v1/indexer/batch?search_type=github", files=files, headers=headers) # Assert assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github" @@ -288,24 +286,20 @@ def test_notes_search_with_exclude_filter( def get_sample_files_data(): return { - "org": { - "path/to/filename.org": "* practicing piano", - "path/to/filename1.org": "** top 3 reasons why I moved to SF", - "path/to/filename2.org": "* how to build a search engine", - }, - "pdf": { - "path/to/filename.pdf": "Moore's law does not apply to consumer hardware", - "path/to/filename1.pdf": "The sun is a ball of helium", - "path/to/filename2.pdf": "Effect of sunshine on baseline human happiness", - }, - "plaintext": { - "path/to/filename.txt": "data,column,value", - "path/to/filename1.txt": "my first web page", - "path/to/filename2.txt": "2021-02-02 Journal Entry", - }, - "markdown": { - "path/to/filename.md": "# Notes from client call", - "path/to/filename1.md": "## Studying anthropological records from the Fatimid caliphate", - "path/to/filename2.md": 
"**Understanding science through the lens of art**", - }, + "files": ("path/to/filename.org", "* practicing piano", "text/org"), + "files": ("path/to/filename1.org", "** top 3 reasons why I moved to SF", "text/org"), + "files": ("path/to/filename2.org", "* how to build a search engine", "text/org"), + "files": ("path/to/filename.pdf", "Moore's law does not apply to consumer hardware", "application/pdf"), + "files": ("path/to/filename1.pdf", "The sun is a ball of helium", "application/pdf"), + "files": ("path/to/filename2.pdf", "Effect of sunshine on baseline human happiness", "application/pdf"), + "files": ("path/to/filename.txt", "data,column,value", "text/plain"), + "files": ("path/to/filename1.txt", "my first web page", "text/plain"), + "files": ("path/to/filename2.txt", "2021-02-02 Journal Entry", "text/plain"), + "files": ("path/to/filename.md", "# Notes from client call", "text/markdown"), + "files": ( + "path/to/filename1.md", + "## Studying anthropological records from the Fatimid caliphate", + "text/markdown", + ), + "files": ("path/to/filename2.md", "**Understanding science through the lens of art**", "text/markdown"), } From 292f0420ad16efe2b39f318214a9aaac8f8c802c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 12 Oct 2023 20:32:41 -0700 Subject: [PATCH 16/62] Send content for indexing on server at a regular interval from khoj.el - Allow indexing frequency to be configurable by user - Ensure there is only one khoj indexing timer running --- src/interface/emacs/khoj.el | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 3d103c0b..44c52601 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -97,6 +97,11 @@ :group 'khoj :type 'string) +(defcustom khoj-index-interval 3600 + "Interval (in seconds) to wait before updating content index." + :group 'khoj + :type 'number) + (defcustom khoj-default-content-type "org" "The default content type to perform search on." :group 'khoj @@ -128,6 +133,9 @@ (defvar khoj--search-on-idle-timer nil "Idle timer to trigger incremental search.") +(defvar khoj--index-timer nil + "Timer to trigger content indexing.") + (declare-function org-element-property "org-mode" (PROPERTY ELEMENT)) (declare-function org-element-type "org-mode" (ELEMENT)) (declare-function markdown-mode "markdown-mode" ()) @@ -531,7 +539,6 @@ CONFIG is json obtained from Khoj config API." (let ((boundary (format "-------------------------%d" (random (expt 10 10)))) (files-to-index (or file-paths (append (mapcan (lambda (dir) (directory-files-recursively dir "\\.org$")) khoj-org-directories) khoj-org-files)))) - (let* ((url-request-method "POST") (url-request-extra-headers `(("content-type" . ,(format "multipart/form-data; boundary=%s" boundary)) ("x-api-key" . ,khoj-server-api-key))) @@ -555,9 +562,15 @@ CONFIG is json obtained from Khoj config API." (lambda (status) (with-current-buffer (current-buffer) (goto-char url-http-end-of-headers) - (message "khoj.el: status: %s. response: %s" status (string-trim (buffer-substring-no-properties (point) (point-max)))))) + (message "khoj.el: Update Content Index. Status: %s. 
response: %s" status (string-trim (buffer-substring-no-properties (point) (point-max)))))) nil t t))))) +;; Cancel any running indexing timer +(when khoj--index-timer + (cancel-timer khoj--index-timer)) +;; Send files to index on server every `khoj-index-interval' seconds +(setq khoj--index-timer + (run-with-timer 60 khoj-index-interval 'khoj--server-index-files)) ;; ----------------------------------------------- From bea196aa30f91baa8cccb7e00f032e021c9ab000 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 12 Oct 2023 20:40:39 -0700 Subject: [PATCH 17/62] Explicitly make GET request to /config/data from khoj.el:khoj-server-configure method Previously global state of `url-request-method' would affect the kind of request made to api/config/data API endpoint as it wasn't being explicitly being set before calling the API endpoint This was done with the assumption that the default value of GET for url-request-method wouldn't change globally But in some cases, experientially, it can get changed. This was resulting in khoj.el load failing as POST request was being made instead which would throw error --- src/interface/emacs/khoj.el | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 44c52601..cccdc12c 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -389,6 +389,7 @@ CONFIG is json obtained from Khoj config API." "Configure the Khoj server for search and chat." (interactive) (let* ((org-directory-regexes (or (mapcar (lambda (dir) (format "%s/**/*.org" dir)) khoj-org-directories) json-null)) + (url-request-method "GET") (current-config (with-temp-buffer (url-insert-file-contents (format "%s/api/config/data" khoj-server-url)) @@ -573,9 +574,9 @@ CONFIG is json obtained from Khoj config API." (run-with-timer 60 khoj-index-interval 'khoj--server-index-files)) -;; ----------------------------------------------- -;; Extract and Render Entries of each Content Type -;; ----------------------------------------------- +;; ------------------------------------------- +;; Render Response from Khoj server for Emacs +;; ------------------------------------------- (defun khoj--extract-entries-as-markdown (json-response query) "Convert JSON-RESPONSE, QUERY from API to markdown entries." From b669aa23955ac032b392a3544bf537230f3ed605 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 13 Oct 2023 18:00:37 -0700 Subject: [PATCH 18/62] Clean and fix the content indexing code in the Emacs client - Pass payloads as unibyte. This was causing the request to fail for files with unicode characters - Suppress messages with file content in on index updates - Fix rendering response from server on index update API call - Extract code to populate body of index update HTTP request with files --- src/interface/emacs/khoj.el | 54 +++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index cccdc12c..1e7f9032 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -535,38 +535,46 @@ CONFIG is json obtained from Khoj config API." ;; ------------------- (defun khoj--server-index-files (&optional file-paths) - "Send files to the Khoj server to index for search and chat." + "Send files at `FILE-PATHS' to the Khoj server to index for search and chat." 
(interactive) (let ((boundary (format "-------------------------%d" (random (expt 10 10)))) (files-to-index (or file-paths - (append (mapcan (lambda (dir) (directory-files-recursively dir "\\.org$")) khoj-org-directories) khoj-org-files)))) - (let* ((url-request-method "POST") - (url-request-extra-headers `(("content-type" . ,(format "multipart/form-data; boundary=%s" boundary)) - ("x-api-key" . ,khoj-server-api-key))) - ;; add files to index as form data - (url-request-data (with-temp-buffer - (set-buffer-multibyte t) - (insert "\n") - (dolist (file-to-index files-to-index) - (insert (format "--%s\r\n" boundary)) - (insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-to-index)) - (insert "Content-Type: text/org\r\n\r\n") - (insert (with-temp-buffer - (insert-file-contents-literally file-to-index) - (buffer-string))) - (insert "\r\n")) - (insert (format "--%s--\r\n" boundary)) - (buffer-string)))) + (append (mapcan (lambda (dir) (directory-files-recursively dir "\\.org$")) khoj-org-directories) khoj-org-files))) + (inhibit-message t) + (message-log-max nil)) + (let ((url-request-method "POST") + (url-request-data (khoj--render-files-as-request-body files-to-index boundary)) + (url-request-extra-headers `(("content-type" . ,(format "multipart/form-data; boundary=%s" boundary)) + ("x-api-key" . ,khoj-server-api-key)))) (with-current-buffer (url-retrieve (format "%s/api/v1/indexer/batch" khoj-server-url) ;; render response from indexing API endpoint on server (lambda (status) - (with-current-buffer (current-buffer) - (goto-char url-http-end-of-headers) - (message "khoj.el: Update Content Index. Status: %s. response: %s" status (string-trim (buffer-substring-no-properties (point) (point-max)))))) + (if (not status) + (message "khoj.el: Updated Content Index") + (with-current-buffer (current-buffer) + (goto-char "\n\n") + (message "khoj.el: Failed to update Content Index. Status: %s. Response: %s" status (string-trim (buffer-substring-no-properties (point) (point-max))))))) nil t t))))) -;; Cancel any running indexing timer +(defun khoj--render-files-as-request-body (files-to-index boundary) + "Render `FILES-TO-INDEX' as multi-part form body using `BOUNDARY'. +This is sent to Khoj server as a POST request." 
+ (with-temp-buffer + (set-buffer-multibyte nil) + (insert "\n") + (dolist (file-to-index files-to-index) + (insert (format "--%s\r\n" boundary)) + (insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-to-index)) + (insert "Content-Type: text/org\r\n\r\n") + (insert (with-temp-buffer + (insert-file-contents-literally file-to-index) + (buffer-string))) + (insert "\r\n")) + (insert (format "--%s--\r\n" boundary)) + (buffer-string))) + +;; Cancel any running indexing timer, first (when khoj--index-timer (cancel-timer khoj--index-timer)) ;; Send files to index on server every `khoj-index-interval' seconds From 80fb56b8a5e633702f08e4213c1d432eb07a629f Mon Sep 17 00:00:00 2001 From: sabaimran Date: Fri, 13 Oct 2023 19:23:00 -0700 Subject: [PATCH 19/62] Sync deksktop app package version with the other releases --- src/interface/desktop/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/interface/desktop/package.json b/src/interface/desktop/package.json index fb2d9983..7fc07912 100644 --- a/src/interface/desktop/package.json +++ b/src/interface/desktop/package.json @@ -2,7 +2,7 @@ "name": "Khoj", "homepage": ".", "productName": "Khoj", - "version": "1.0.2", + "version": "0.12.3", "description": "Scaffolding for the desktop entrypoint to Khoj", "main": "main.js", "repository": "\"https://github.com/khoj-ai/khoj\"", From 96c0b212856aafb1763b264a54bfe15805eca61c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 13 Oct 2023 20:22:33 -0700 Subject: [PATCH 20/62] Sync desktop app package.json with other Khoj clients metadata - Make `bump_version.sh' script set version for the Khoj desktop app too - Sync Khoj desktop app authors, license, description and version with the other interfaces and server - Update description in packages metadata to match project subtitle on Github --- pyproject.toml | 2 +- scripts/bump_version.sh | 4 ++++ src/interface/desktop/package.json | 12 ++++++------ src/interface/emacs/khoj.el | 13 +++++++------ src/interface/obsidian/package.json | 6 +++--- 5 files changed, 21 insertions(+), 16 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cdf8f284..d0890e7b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "khoj-assistant" -description = "An AI personal assistant for your Digital Brain" +description = "An AI copilot for your Second Brain" readme = "README.md" license = "GPL-3.0-or-later" requires-python = ">=3.8" diff --git a/scripts/bump_version.sh b/scripts/bump_version.sh index 07d2117f..561953dd 100755 --- a/scripts/bump_version.sh +++ b/scripts/bump_version.sh @@ -9,6 +9,10 @@ do # Get current project version current_version=$OPTARG + # Bump Desktop app to current version + cd $project_root/src/interface/desktop + sed -E -i.bak "s/version\": \"(.*)\",/version\": \"$current_version\",/" package.json + # Bump Obsidian plugin to current version cd $project_root/src/interface/obsidian sed -E -i.bak "s/version\": \"(.*)\",/version\": \"$current_version\",/" package.json diff --git a/src/interface/desktop/package.json b/src/interface/desktop/package.json index 7fc07912..0b5f220c 100644 --- a/src/interface/desktop/package.json +++ b/src/interface/desktop/package.json @@ -1,13 +1,13 @@ { "name": "Khoj", - "homepage": ".", - "productName": "Khoj", "version": "0.12.3", - "description": "Scaffolding for the desktop entrypoint to Khoj", - "main": "main.js", + "description": "An AI copilot for your Second Brain", + "author": 
"Saba Imran, Debanjum Singh Solanky ", + "license": "GPL-3.0-or-later", + "homepage": "https://khoj.dev", "repository": "\"https://github.com/khoj-ai/khoj\"", - "author": "Khoj ", - "license": "MIT", + "productName": "Khoj", + "main": "main.js", "private": false, "devDependencies": { "electron": "25.8.1" diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index e690b480..09de2f93 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -1,9 +1,10 @@ -;;; khoj.el --- AI personal assistant for your digital brain -*- lexical-binding: t -*- +;;; khoj.el --- AI copilot for your Second Brain -*- lexical-binding: t -*- -;; Copyright (C) 2021-2022 Debanjum Singh Solanky +;; Copyright (C) 2021-2023 Khoj Inc. -;; Author: Debanjum Singh Solanky -;; Description: An AI personal assistant for your digital brain +;; Author: Debanjum Singh Solanky +;; Saba Imran +;; Description: An AI copilot for your Second Brain ;; Keywords: search, chat, org-mode, outlines, markdown, pdf, image ;; Version: 0.12.3 ;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1")) @@ -28,8 +29,8 @@ ;;; Commentary: -;; Create an AI personal assistant for your `org-mode', `markdown' notes, -;; PDFs and images. The assistant exposes 2 modes, search and chat: +;; Create an AI copilot to your `org-mode', `markdown' notes, +;; PDFs and images. The copilot exposes 2 modes, search and chat: ;; ;; Chat provides faster answers, iterative discovery and assisted ;; creativity. It requires your OpenAI API key to access GPT models diff --git a/src/interface/obsidian/package.json b/src/interface/obsidian/package.json index eb18132f..07c47140 100644 --- a/src/interface/obsidian/package.json +++ b/src/interface/obsidian/package.json @@ -1,7 +1,9 @@ { "name": "Khoj", "version": "0.12.3", - "description": "An AI Personal Assistant for your Digital Brain", + "description": "An AI copilot for your Second Brain", + "author": "Debanjum Singh Solanky, Saba Imran ", + "license": "GPL-3.0-or-later", "main": "src/main.js", "scripts": { "dev": "node esbuild.config.mjs", @@ -14,8 +16,6 @@ "AI", "assistant" ], - "author": "Debanjum Singh Solanky", - "license": "GPL-3.0-or-later", "devDependencies": { "@types/node": "^16.11.6", "@typescript-eslint/eslint-plugin": "5.29.0", From 09bb3686ccb9b52f36eae3d3f806e63f4853f54a Mon Sep 17 00:00:00 2001 From: sabaimran <65192171+sabaimran@users.noreply.github.com> Date: Fri, 13 Oct 2023 21:11:23 -0700 Subject: [PATCH 21/62] Strip the incoming query from the slash conversation command (#500) * Strip the incoming query from the slash conversation command before passing it to the model or for search * Return q when content index not loaded * Remove -n 4 from pytest ini configuration to isolate test failures --- pyproject.toml | 2 +- src/khoj/routers/api.py | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d0890e7b..193c0cc3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -112,7 +112,7 @@ warn_unused_ignores = false line-length = 120 [tool.pytest.ini_options] -addopts = "--strict-markers -n 4" +addopts = "--strict-markers" markers = [ "chatquality: Evaluate chatbot capabilities and quality", ] diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index ff2d88a2..780a6c57 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -702,10 +702,16 @@ async def chat( ) -> Response: perform_chat_checks() conversation_command = get_conversation_command(query=q, 
any_references=True) + + q = q.replace(f"/{conversation_command.value}", "").strip() + compiled_references, inferred_queries, defiltered_query = await extract_references_and_questions( request, q, (n or 5), conversation_command ) - conversation_command = get_conversation_command(query=q, any_references=not is_none_or_empty(compiled_references)) + + if conversation_command == ConversationCommand.Default and is_none_or_empty(compiled_references): + conversation_command = ConversationCommand.General + if conversation_command == ConversationCommand.Help: model_type = "offline" if state.processor_config.conversation.enable_offline_chat else "openai" formatted_help = help_message.format(model=model_type, version=state.khoj_version) @@ -768,18 +774,16 @@ async def extract_references_and_questions( logger.warning( "No content index loaded, so cannot extract references from knowledge base. Please configure your data sources and update the index to chat with your notes." ) - return compiled_references, inferred_queries + return compiled_references, inferred_queries, q if conversation_type == ConversationCommand.General: return compiled_references, inferred_queries, q # Extract filter terms from user message defiltered_query = q - filter_terms = [] for filter in [DateFilter(), WordFilter(), FileFilter()]: - filter_terms += filter.get_filter_terms(q) - defiltered_query = filter.defilter(q) - filters_in_query = " ".join(filter_terms) + defiltered_query = filter.defilter(defiltered_query) + filters_in_query = q.replace(defiltered_query, "").strip() # Infer search queries from user message with timer("Extracting search queries took", logger): From 56bd69d5af036a09223bd1c3b596fe83443401ef Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 4 Oct 2023 20:42:25 -0700 Subject: [PATCH 22/62] Improve Llama v2 extract questions actor and associated prompt - Format extract questions prompt format with newlines and whitespaces - Make llama v2 extract questions prompt consistent - Remove empty questions extracted by offline extract_questions actor - Update implicit qs extraction unit test for offline search actor --- .../conversation/gpt4all/chat_model.py | 2 +- src/khoj/processor/conversation/prompts.py | 38 +++++++++++-------- tests/test_gpt4all_chat_actors.py | 6 +-- 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/src/khoj/processor/conversation/gpt4all/chat_model.py b/src/khoj/processor/conversation/gpt4all/chat_model.py index d713831a..e9beaa80 100644 --- a/src/khoj/processor/conversation/gpt4all/chat_model.py +++ b/src/khoj/processor/conversation/gpt4all/chat_model.py @@ -113,7 +113,7 @@ def filter_questions(questions: List[str]): ] filtered_questions = [] for q in questions: - if not any([word in q.lower() for word in hint_words]): + if not any([word in q.lower() for word in hint_words]) and not is_none_or_empty(q): filtered_questions.append(q) return filtered_questions diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py index 4de3c623..d487609d 100644 --- a/src/khoj/processor/conversation/prompts.py +++ b/src/khoj/processor/conversation/prompts.py @@ -23,7 +23,7 @@ no_notes_found = PromptTemplate.from_template( """.strip() ) -system_prompt_message_llamav2 = f"""You are Khoj, a friendly, smart and helpful personal assistant. +system_prompt_message_llamav2 = f"""You are Khoj, a smart, inquisitive and helpful personal assistant. Using your general knowledge and our past conversations as context, answer the following question. 
If you do not know the answer, say 'I don't know.'""" @@ -51,13 +51,13 @@ extract_questions_system_prompt_llamav2 = PromptTemplate.from_template( general_conversation_llamav2 = PromptTemplate.from_template( """ -[INST]{query}[/INST] +[INST] {query} [/INST] """.strip() ) chat_history_llamav2_from_user = PromptTemplate.from_template( """ -[INST]{message}[/INST] +[INST] {message} [/INST] """.strip() ) @@ -69,7 +69,7 @@ chat_history_llamav2_from_assistant = PromptTemplate.from_template( conversation_llamav2 = PromptTemplate.from_template( """ -[INST]{query}[/INST] +[INST] {query} [/INST] """.strip() ) @@ -91,7 +91,7 @@ Question: {query} notes_conversation_llamav2 = PromptTemplate.from_template( """ -Notes: +User's Notes: {references} Question: {query} """.strip() @@ -134,19 +134,25 @@ Answer (in second person):""" extract_questions_llamav2_sample = PromptTemplate.from_template( """ -[INST]<>Current Date: {current_date}<>[/INST] -[INST]How was my trip to Cambodia?[/INST][] -[INST]Who did I visit the temple with on that trip?[/INST]Who did I visit the temple with in Cambodia? -[INST]How should I take care of my plants?[/INST]What kind of plants do I have? What issues do my plants have? -[INST]How many tennis balls fit in the back of a 2002 Honda Civic?[/INST]What is the size of a tennis ball? What is the trunk size of a 2002 Honda Civic? -[INST]What did I do for Christmas last year?[/INST]What did I do for Christmas {last_year} dt>='{last_christmas_date}' dt<'{next_christmas_date}' -[INST]How are you feeling today?[/INST] -[INST]Is Alice older than Bob?[/INST]When was Alice born? What is Bob's age? -[INST]<> +[INST] <>Current Date: {current_date}<> [/INST] +[INST] How was my trip to Cambodia? [/INST] +How was my trip to Cambodia? +[INST] Who did I visit the temple with on that trip? [/INST] +Who did I visit the temple with in Cambodia? +[INST] How should I take care of my plants? [/INST] +What kind of plants do I have? What issues do my plants have? +[INST] How many tennis balls fit in the back of a 2002 Honda Civic? [/INST] +What is the size of a tennis ball? What is the trunk size of a 2002 Honda Civic? +[INST] What did I do for Christmas last year? [/INST] +What did I do for Christmas {last_year} dt>='{last_christmas_date}' dt<'{next_christmas_date}' +[INST] How are you feeling today? [/INST] +[INST] Is Alice older than Bob? [/INST] +When was Alice born? What is Bob's age? 
+[INST] <> Use these notes from the user's previous conversations to provide a response: {chat_history} -<>[/INST] -[INST]{query}[/INST] +<> [/INST] +[INST] {query} [/INST] """ ) diff --git a/tests/test_gpt4all_chat_actors.py b/tests/test_gpt4all_chat_actors.py index 32ee4020..056618be 100644 --- a/tests/test_gpt4all_chat_actors.py +++ b/tests/test_gpt4all_chat_actors.py @@ -128,15 +128,15 @@ def test_extract_multiple_explicit_questions_from_message(loaded_model): @pytest.mark.chatquality def test_extract_multiple_implicit_questions_from_message(loaded_model): # Act - response = extract_questions_offline("Is Morpheus taller than Neo?", loaded_model=loaded_model) + response = extract_questions_offline("Is Carl taller than Ross?", loaded_model=loaded_model) # Assert - expected_responses = ["height", "taller", "shorter", "heights"] + expected_responses = ["height", "taller", "shorter", "heights", "who"] assert len(response) <= 3 for question in response: assert any([expected_response in question.lower() for expected_response in expected_responses]), ( - "Expected chat actor to ask follow-up questions about Morpheus and Neo, but got: " + question + "Expected chat actor to ask follow-up questions about Carl and Ross, but got: " + question ) From 1ad8b150e88061d5cea295b610be2185c8532047 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 13 Oct 2023 22:26:59 -0700 Subject: [PATCH 23/62] Add default tokenizer, max_prompt as fallback for non-default offline chat models Pass user configured chat model as argument to use by converse_offline The proper fix for this would allow users to configure the max_prompt and tokenizer to use (while supplying default ones, if none provided) For now, this is a reasonable start. --- pyproject.toml | 4 ++-- src/khoj/processor/conversation/utils.py | 12 +++++++++--- src/khoj/routers/helpers.py | 1 + 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a52fc9b6..e6773b88 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,8 +59,8 @@ dependencies = [ "bs4 >= 0.0.1", "anyio == 3.7.1", "pymupdf >= 1.23.3", - "gpt4all == 1.0.12; platform_system == 'Linux' and platform_machine == 'x86_64'", - "gpt4all == 1.0.12; platform_system == 'Windows' or platform_system == 'Darwin'", + "gpt4all >= 1.0.12; platform_system == 'Linux' and platform_machine == 'x86_64'", + "gpt4all >= 1.0.12; platform_system == 'Windows' or platform_system == 'Darwin'", ] dynamic = ["version"] diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index ece526c2..96c4c1c8 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -19,8 +19,12 @@ max_prompt_size = { "gpt-4": 8192, "llama-2-7b-chat.ggmlv3.q4_0.bin": 1548, "gpt-3.5-turbo-16k": 15000, + "default": 1600, +} +tokenizer = { + "llama-2-7b-chat.ggmlv3.q4_0.bin": "hf-internal-testing/llama-tokenizer", + "default": "hf-internal-testing/llama-tokenizer", } -tokenizer = {"llama-2-7b-chat.ggmlv3.q4_0.bin": "hf-internal-testing/llama-tokenizer"} class ThreadedGenerator: @@ -105,7 +109,7 @@ def generate_chatml_messages_with_context( messages = user_chatml_message + rest_backnforths + system_chatml_message # Truncate oldest messages from conversation history until under max supported prompt size by model - messages = truncate_messages(messages, max_prompt_size[model_name], model_name) + messages = truncate_messages(messages, max_prompt_size.get(model_name, max_prompt_size["default"]), model_name) # Return 
message in chronological order return messages[::-1] @@ -116,8 +120,10 @@ def truncate_messages(messages: list[ChatMessage], max_prompt_size, model_name) if "llama" in model_name: encoder = LlamaTokenizerFast.from_pretrained(tokenizer[model_name]) - else: + elif "gpt" in model_name: encoder = tiktoken.encoding_for_model(model_name) + else: + encoder = LlamaTokenizerFast.from_pretrained(tokenizer["default"]) system_message = messages.pop() system_message_tokens = len(encoder.encode(system_message.content)) diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index 267af330..3898d1b8 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -122,6 +122,7 @@ def generate_chat_response( conversation_log=meta_log, completion_func=partial_completion, conversation_command=conversation_command, + model=state.processor_config.conversation.gpt4all_model.chat_model, ) elif state.processor_config.conversation.openai_model: From ff2dbadc9d45c31bbb686836b640edc79f3e944f Mon Sep 17 00:00:00 2001 From: Saba Date: Sat, 14 Oct 2023 13:28:34 -0700 Subject: [PATCH 24/62] Use computed plaintext_content to set file content rather than calling f.read again --- src/khoj/utils/fs_syncer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/khoj/utils/fs_syncer.py b/src/khoj/utils/fs_syncer.py index d303d39b..8f398104 100644 --- a/src/khoj/utils/fs_syncer.py +++ b/src/khoj/utils/fs_syncer.py @@ -74,7 +74,7 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]: plaintext_content = f.read() if file.endswith(("html", "htm", "xml")): plaintext_content = extract_html_content(plaintext_content) - filename_to_content_map[file] = f.read() + filename_to_content_map[file] = plaintext_content except Exception as e: logger.warning(f"Unable to read file: {file} as plaintext. 
Skipping file.") logger.warning(e, exc_info=True) From 247e75595c3377529497597dbd4a0fe4ef6cb0a3 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 14 Oct 2023 16:54:52 -0700 Subject: [PATCH 25/62] Use AutoTokenizer to support more tokenizers --- src/khoj/processor/conversation/utils.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index 96c4c1c8..7bb86887 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -7,7 +7,7 @@ import tiktoken # External packages from langchain.schema import ChatMessage -from transformers import LlamaTokenizerFast +from transformers import AutoTokenizer # Internal Packages import queue @@ -115,15 +115,13 @@ def generate_chatml_messages_with_context( return messages[::-1] -def truncate_messages(messages: list[ChatMessage], max_prompt_size, model_name) -> list[ChatMessage]: +def truncate_messages(messages: list[ChatMessage], max_prompt_size, model_name: str) -> list[ChatMessage]: """Truncate messages to fit within max prompt size supported by model""" - if "llama" in model_name: - encoder = LlamaTokenizerFast.from_pretrained(tokenizer[model_name]) - elif "gpt" in model_name: + if model_name.startswith("gpt-"): encoder = tiktoken.encoding_for_model(model_name) else: - encoder = LlamaTokenizerFast.from_pretrained(tokenizer["default"]) + encoder = AutoTokenizer.from_pretrained(tokenizer.get(model_name, tokenizer["default"])) system_message = messages.pop() system_message_tokens = len(encoder.encode(system_message.content)) From feb4f17e3d3e8aaabcf5a41c3be4f9d1914ec5b8 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 15 Oct 2023 14:19:29 -0700 Subject: [PATCH 26/62] Update chat config schema. 
Make max_prompt, chat tokenizer configurable This provides flexibility to use non 1st party supported chat models - Create migration script to update khoj.yml config - Put `enable_offline_chat' under new `offline-chat' section Referring code needs to be updated to accomodate this change - Move `offline_chat_model' to `chat-model' under new `offline-chat' section - Put chat `tokenizer` under new `offline-chat' section - Put `max_prompt' under existing `conversation' section As `max_prompt' size effects both openai and offline chat models --- src/khoj/configure.py | 6 +- src/khoj/interface/web/config.html | 14 ++-- .../migrations/migrate_offline_chat_schema.py | 83 +++++++++++++++++++ src/khoj/routers/api.py | 10 +-- src/khoj/routers/helpers.py | 2 +- src/khoj/utils/cli.py | 8 +- src/khoj/utils/config.py | 6 +- src/khoj/utils/rawconfig.py | 10 ++- tests/conftest.py | 4 +- 9 files changed, 119 insertions(+), 24 deletions(-) create mode 100644 src/khoj/migrations/migrate_offline_chat_schema.py diff --git a/src/khoj/configure.py b/src/khoj/configure.py index 7e6cc409..769f015c 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -19,7 +19,7 @@ from khoj.utils.config import ( ) from khoj.utils.helpers import resolve_absolute_path, merge_dicts from khoj.utils.fs_syncer import collect_files -from khoj.utils.rawconfig import FullConfig, ProcessorConfig, ConversationProcessorConfig +from khoj.utils.rawconfig import FullConfig, OfflineChatProcessorConfig, ProcessorConfig, ConversationProcessorConfig from khoj.routers.indexer import configure_content, load_content, configure_search @@ -168,9 +168,7 @@ def configure_conversation_processor( conversation_config=ConversationProcessorConfig( conversation_logfile=conversation_logfile, openai=(conversation_config.openai if (conversation_config is not None) else None), - enable_offline_chat=( - conversation_config.enable_offline_chat if (conversation_config is not None) else False - ), + offline_chat=conversation_config.offline_chat if conversation_config else OfflineChatProcessorConfig(), ) ) else: diff --git a/src/khoj/interface/web/config.html b/src/khoj/interface/web/config.html index 3b295a88..d41ca26b 100644 --- a/src/khoj/interface/web/config.html +++ b/src/khoj/interface/web/config.html @@ -236,7 +236,7 @@
-Setup chat using OpenAI
+Setup online chat using OpenAI
-Setup offline chat (Llama V2)
+Setup offline chat
@@ -346,7 +346,7 @@ featuresHintText.classList.add("show"); } - fetch('/api/config/data/processor/conversation/enable_offline_chat' + '?enable_offline_chat=' + enable, { + fetch('/api/config/data/processor/conversation/offline_chat' + '?enable_offline_chat=' + enable, { method: 'POST', headers: { 'Content-Type': 'application/json', diff --git a/src/khoj/migrations/migrate_offline_chat_schema.py b/src/khoj/migrations/migrate_offline_chat_schema.py new file mode 100644 index 00000000..873783a3 --- /dev/null +++ b/src/khoj/migrations/migrate_offline_chat_schema.py @@ -0,0 +1,83 @@ +""" +Current format of khoj.yml +--- +app: + ... +content-type: + ... +processor: + conversation: + enable-offline-chat: false + conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json + openai: + ... +search-type: + ... + +New format of khoj.yml +--- +app: + ... +content-type: + ... +processor: + conversation: + offline-chat: + enable-offline-chat: false + chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin + tokenizer: null + max_prompt_size: null + conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json + openai: + ... +search-type: + ... +""" +import logging +from packaging import version + +from khoj.utils.yaml import load_config_from_file, save_config_to_file + + +logger = logging.getLogger(__name__) + + +def migrate_offline_chat_schema(args): + schema_version = "0.12.3" + raw_config = load_config_from_file(args.config_file) + previous_version = raw_config.get("version") + + if "processor" not in raw_config: + return args + if raw_config["processor"] is None: + return args + if "conversation" not in raw_config["processor"]: + return args + + if previous_version is None or version.parse(previous_version) < version.parse("0.12.3"): + logger.info( + f"Upgrading config schema to {schema_version} from {previous_version} to make (offline) chat more configuration" + ) + raw_config["version"] = schema_version + + # Create max-prompt-size field in conversation processor schema + raw_config["processor"]["conversation"]["max-prompt-size"] = None + raw_config["processor"]["conversation"]["tokenizer"] = None + + # Create offline chat schema based on existing enable_offline_chat field in khoj config schema + offline_chat_model = ( + raw_config["processor"]["conversation"] + .get("offline-chat", {}) + .get("chat-model", "llama-2-7b-chat.ggmlv3.q4_0.bin") + ) + raw_config["processor"]["conversation"]["offline-chat"] = { + "enable-offline-chat": raw_config["processor"]["conversation"].get("enable-offline-chat", False), + "chat-model": offline_chat_model, + } + + # Delete old enable-offline-chat field from conversation processor schema + if "enable-offline-chat" in raw_config["processor"]["conversation"]: + del raw_config["processor"]["conversation"]["enable-offline-chat"] + + save_config_to_file(raw_config, args.config_file) + return args diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 2ff6bab0..91db7c58 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -284,7 +284,7 @@ if not state.demo: except Exception as e: return {"status": "error", "message": str(e)} - @api.post("/config/data/processor/conversation/enable_offline_chat", status_code=200) + @api.post("/config/data/processor/conversation/offline_chat", status_code=200) async def set_processor_enable_offline_chat_config_data( request: Request, enable_offline_chat: bool, @@ -301,7 +301,7 @@ if not state.demo: state.config.processor = 
ProcessorConfig(conversation=ConversationProcessorConfig(conversation_logfile=conversation_logfile)) # type: ignore assert state.config.processor.conversation is not None - state.config.processor.conversation.enable_offline_chat = enable_offline_chat + state.config.processor.conversation.offline_chat.enable_offline_chat = enable_offline_chat state.processor_config = configure_processor(state.config.processor, state.processor_config) update_telemetry_state( @@ -707,7 +707,7 @@ async def chat( ) conversation_command = get_conversation_command(query=q, any_references=not is_none_or_empty(compiled_references)) if conversation_command == ConversationCommand.Help: - model_type = "offline" if state.processor_config.conversation.enable_offline_chat else "openai" + model_type = "offline" if state.processor_config.conversation.offline_chat.enable_offline_chat else "openai" formatted_help = help_message.format(model=model_type, version=state.khoj_version) return StreamingResponse(iter([formatted_help]), media_type="text/event-stream", status_code=200) @@ -784,7 +784,7 @@ async def extract_references_and_questions( # Infer search queries from user message with timer("Extracting search queries took", logger): # If we've reached here, either the user has enabled offline chat or the openai model is enabled. - if state.processor_config.conversation.enable_offline_chat: + if state.processor_config.conversation.offline_chat.enable_offline_chat: loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model inferred_queries = extract_questions_offline( defiltered_query, loaded_model=loaded_model, conversation_log=meta_log, should_extract_questions=False @@ -800,7 +800,7 @@ async def extract_references_and_questions( with timer("Searching knowledge base took", logger): result_list = [] for query in inferred_queries: - n_items = min(n, 3) if state.processor_config.conversation.enable_offline_chat else n + n_items = min(n, 3) if state.processor_config.conversation.offline_chat.enable_offline_chat else n result_list.extend( await search( f"{query} {filters_in_query}", diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index 3898d1b8..0bc66991 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -113,7 +113,7 @@ def generate_chat_response( meta_log=meta_log, ) - if state.processor_config.conversation.enable_offline_chat: + if state.processor_config.conversation.offline_chat.enable_offline_chat: loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model chat_response = converse_offline( references=compiled_references, diff --git a/src/khoj/utils/cli.py b/src/khoj/utils/cli.py index 78a9ccf9..1d6106cb 100644 --- a/src/khoj/utils/cli.py +++ b/src/khoj/utils/cli.py @@ -9,6 +9,7 @@ from khoj.utils.yaml import parse_config_from_file from khoj.migrations.migrate_version import migrate_config_to_version from khoj.migrations.migrate_processor_config_openai import migrate_processor_conversation_schema from khoj.migrations.migrate_offline_model import migrate_offline_model +from khoj.migrations.migrate_offline_chat_schema import migrate_offline_chat_schema def cli(args=None): @@ -55,7 +56,12 @@ def cli(args=None): def run_migrations(args): - migrations = [migrate_config_to_version, migrate_processor_conversation_schema, migrate_offline_model] + migrations = [ + migrate_config_to_version, + migrate_processor_conversation_schema, + migrate_offline_model, + migrate_offline_chat_schema, + ] for migration in migrations: args = migration(args) return 
args diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index 5accd2ad..90e8862a 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -96,18 +96,18 @@ class ConversationProcessorConfigModel: self.openai_model = conversation_config.openai self.gpt4all_model = GPT4AllProcessorConfig() self.gpt4all_model.chat_model = conversation_config.offline_chat_model - self.enable_offline_chat = conversation_config.enable_offline_chat + self.offline_chat = conversation_config.offline_chat self.conversation_logfile = Path(conversation_config.conversation_logfile) self.chat_session: List[str] = [] self.meta_log: dict = {} - if self.enable_offline_chat: + if self.offline_chat.enable_offline_chat: try: self.gpt4all_model.loaded_model = download_model(self.gpt4all_model.chat_model) except ValueError as e: + self.offline_chat.enable_offline_chat = False self.gpt4all_model.loaded_model = None logger.error(f"Error while loading offline chat model: {e}", exc_info=True) - self.enable_offline_chat = False else: self.gpt4all_model.loaded_model = None diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index 30a98354..f7c42266 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -91,11 +91,17 @@ class OpenAIProcessorConfig(ConfigBase): chat_model: Optional[str] = "gpt-3.5-turbo" +class OfflineChatProcessorConfig(ConfigBase): + enable_offline_chat: Optional[bool] = False + chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin" + + class ConversationProcessorConfig(ConfigBase): conversation_logfile: Path openai: Optional[OpenAIProcessorConfig] - enable_offline_chat: Optional[bool] = False - offline_chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin" + offline_chat: Optional[OfflineChatProcessorConfig] + max_prompt_size: Optional[int] + tokenizer: Optional[str] class ProcessorConfig(ConfigBase): diff --git a/tests/conftest.py b/tests/conftest.py index d851341d..f75dfceb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,6 +16,7 @@ from khoj.utils.helpers import resolve_absolute_path from khoj.utils.rawconfig import ( ContentConfig, ConversationProcessorConfig, + OfflineChatProcessorConfig, OpenAIProcessorConfig, ProcessorConfig, TextContentConfig, @@ -205,8 +206,9 @@ def processor_config_offline_chat(tmp_path_factory): # Setup conversation processor processor_config = ProcessorConfig() + offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True) processor_config.conversation = ConversationProcessorConfig( - enable_offline_chat=True, + offline_chat=offline_chat, conversation_logfile=processor_dir.joinpath("conversation_logs.json"), ) From 116595b351d1dfeeaaa7399d25cbb32c064eeafa Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 15 Oct 2023 14:24:28 -0700 Subject: [PATCH 27/62] Use chat_model specified in new offline_chat section of config - Dedupe offline_chat_model variable. Only reference offline chat model stored under offline_chat. 
Delete the previous chat_model field under GPT4AllProcessorConfig - Set offline chat model to use via config/offline_chat API endpoint --- src/khoj/routers/api.py | 3 +++ src/khoj/routers/helpers.py | 2 +- src/khoj/utils/config.py | 4 +--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 91db7c58..8dc0a37e 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -288,6 +288,7 @@ if not state.demo: async def set_processor_enable_offline_chat_config_data( request: Request, enable_offline_chat: bool, + offline_chat_model: Optional[str] = None, client: Optional[str] = None, ): _initialize_config() @@ -302,6 +303,8 @@ if not state.demo: assert state.config.processor.conversation is not None state.config.processor.conversation.offline_chat.enable_offline_chat = enable_offline_chat + if offline_chat_model is not None: + state.config.processor.conversation.offline_chat.chat_model = offline_chat_model state.processor_config = configure_processor(state.config.processor, state.processor_config) update_telemetry_state( diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index 0bc66991..d8b0aa8b 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -122,7 +122,7 @@ def generate_chat_response( conversation_log=meta_log, completion_func=partial_completion, conversation_command=conversation_command, - model=state.processor_config.conversation.gpt4all_model.chat_model, + model=state.processor_config.conversation.offline_chat.chat_model, ) elif state.processor_config.conversation.openai_model: diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index 90e8862a..daae1982 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -84,7 +84,6 @@ class SearchModels: @dataclass class GPT4AllProcessorConfig: - chat_model: Optional[str] = None loaded_model: Union[Any, None] = None @@ -95,7 +94,6 @@ class ConversationProcessorConfigModel: ): self.openai_model = conversation_config.openai self.gpt4all_model = GPT4AllProcessorConfig() - self.gpt4all_model.chat_model = conversation_config.offline_chat_model self.offline_chat = conversation_config.offline_chat self.conversation_logfile = Path(conversation_config.conversation_logfile) self.chat_session: List[str] = [] @@ -103,7 +101,7 @@ class ConversationProcessorConfigModel: if self.offline_chat.enable_offline_chat: try: - self.gpt4all_model.loaded_model = download_model(self.gpt4all_model.chat_model) + self.gpt4all_model.loaded_model = download_model(self.offline_chat.chat_model) except ValueError as e: self.offline_chat.enable_offline_chat = False self.gpt4all_model.loaded_model = None From df1d74a879d5b62ab983bcbba8d9bee1c5fce03f Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 15 Oct 2023 16:33:26 -0700 Subject: [PATCH 28/62] Use max_prompt_size, tokenizer from config for chat model context stuffing --- .../conversation/gpt4all/chat_model.py | 4 ++ src/khoj/processor/conversation/openai/gpt.py | 4 ++ src/khoj/processor/conversation/utils.py | 45 ++++++++++++++----- src/khoj/routers/helpers.py | 4 ++ src/khoj/utils/config.py | 2 + 5 files changed, 48 insertions(+), 11 deletions(-) diff --git a/src/khoj/processor/conversation/gpt4all/chat_model.py b/src/khoj/processor/conversation/gpt4all/chat_model.py index e9beaa80..7e92d002 100644 --- a/src/khoj/processor/conversation/gpt4all/chat_model.py +++ b/src/khoj/processor/conversation/gpt4all/chat_model.py @@ -127,6 +127,8 @@ def converse_offline( 
loaded_model: Union[Any, None] = None, completion_func=None, conversation_command=ConversationCommand.Default, + max_prompt_size=None, + tokenizer_name=None, ) -> Union[ThreadedGenerator, Iterator[str]]: """ Converse with user using Llama @@ -158,6 +160,8 @@ def converse_offline( prompts.system_prompt_message_llamav2, conversation_log, model_name=model, + max_prompt_size=max_prompt_size, + tokenizer_name=tokenizer_name, ) g = ThreadedGenerator(references, completion_func=completion_func) diff --git a/src/khoj/processor/conversation/openai/gpt.py b/src/khoj/processor/conversation/openai/gpt.py index 96510586..73b4f176 100644 --- a/src/khoj/processor/conversation/openai/gpt.py +++ b/src/khoj/processor/conversation/openai/gpt.py @@ -116,6 +116,8 @@ def converse( temperature: float = 0.2, completion_func=None, conversation_command=ConversationCommand.Default, + max_prompt_size=None, + tokenizer_name=None, ): """ Converse with user using OpenAI's ChatGPT @@ -141,6 +143,8 @@ def converse( prompts.personality.format(), conversation_log, model, + max_prompt_size, + tokenizer_name, ) truncated_messages = "\n".join({f"{message.content[:40]}..." for message in messages}) logger.debug(f"Conversation Context for GPT: {truncated_messages}") diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index 7bb86887..5f219b83 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -13,17 +13,16 @@ from transformers import AutoTokenizer import queue from khoj.utils.helpers import merge_dicts + logger = logging.getLogger(__name__) -max_prompt_size = { +model_to_prompt_size = { "gpt-3.5-turbo": 4096, "gpt-4": 8192, "llama-2-7b-chat.ggmlv3.q4_0.bin": 1548, "gpt-3.5-turbo-16k": 15000, - "default": 1600, } -tokenizer = { +model_to_tokenizer = { "llama-2-7b-chat.ggmlv3.q4_0.bin": "hf-internal-testing/llama-tokenizer", - "default": "hf-internal-testing/llama-tokenizer", } @@ -86,7 +85,13 @@ def message_to_log( def generate_chatml_messages_with_context( - user_message, system_message, conversation_log={}, model_name="gpt-3.5-turbo", lookback_turns=2 + user_message, + system_message, + conversation_log={}, + model_name="gpt-3.5-turbo", + lookback_turns=2, + max_prompt_size=None, + tokenizer_name=None, ): """Generate messages for ChatGPT with context from previous conversation""" # Extract Chat History for Context @@ -108,20 +113,38 @@ def generate_chatml_messages_with_context( messages = user_chatml_message + rest_backnforths + system_chatml_message + # Set max prompt size from user config, pre-configured for model or to default prompt size + try: + max_prompt_size = max_prompt_size or model_to_prompt_size[model_name] + except: + max_prompt_size = 2000 + logger.warning( + f"Fallback to default prompt size: {max_prompt_size}.\nConfigure max_prompt_size for unsupported model: {model_name} in Khoj settings to longer context window." 
+ ) + # Truncate oldest messages from conversation history until under max supported prompt size by model - messages = truncate_messages(messages, max_prompt_size.get(model_name, max_prompt_size["default"]), model_name) + messages = truncate_messages(messages, max_prompt_size, model_name, tokenizer_name) # Return message in chronological order return messages[::-1] -def truncate_messages(messages: list[ChatMessage], max_prompt_size, model_name: str) -> list[ChatMessage]: +def truncate_messages( + messages: list[ChatMessage], max_prompt_size, model_name: str, tokenizer_name=None +) -> list[ChatMessage]: """Truncate messages to fit within max prompt size supported by model""" - if model_name.startswith("gpt-"): - encoder = tiktoken.encoding_for_model(model_name) - else: - encoder = AutoTokenizer.from_pretrained(tokenizer.get(model_name, tokenizer["default"])) + try: + if model_name.startswith("gpt-"): + encoder = tiktoken.encoding_for_model(model_name) + else: + encoder = AutoTokenizer.from_pretrained(tokenizer_name or model_to_tokenizer[model_name]) + except: + default_tokenizer = "hf-internal-testing/llama-tokenizer" + encoder = AutoTokenizer.from_pretrained(default_tokenizer) + logger.warning( + f"Fallback to default chat model tokenizer: {default_tokenizer}.\nConfigure tokenizer for unsupported model: {model_name} in Khoj settings to improve context stuffing." + ) system_message = messages.pop() system_message_tokens = len(encoder.encode(system_message.content)) diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index d8b0aa8b..6b42f29c 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -123,6 +123,8 @@ def generate_chat_response( completion_func=partial_completion, conversation_command=conversation_command, model=state.processor_config.conversation.offline_chat.chat_model, + max_prompt_size=state.processor_config.conversation.max_prompt_size, + tokenizer_name=state.processor_config.conversation.tokenizer, ) elif state.processor_config.conversation.openai_model: @@ -136,6 +138,8 @@ def generate_chat_response( api_key=api_key, completion_func=partial_completion, conversation_command=conversation_command, + max_prompt_size=state.processor_config.conversation.max_prompt_size, + tokenizer_name=state.processor_config.conversation.tokenizer, ) except Exception as e: diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index daae1982..3930ec98 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -95,6 +95,8 @@ class ConversationProcessorConfigModel: self.openai_model = conversation_config.openai self.gpt4all_model = GPT4AllProcessorConfig() self.offline_chat = conversation_config.offline_chat + self.max_prompt_size = conversation_config.max_prompt_size + self.tokenizer = conversation_config.tokenizer self.conversation_logfile = Path(conversation_config.conversation_logfile) self.chat_session: List[str] = [] self.meta_log: dict = {} From 1a9023d3968e9e7ae079dbcf6ee0105209f8d621 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 15 Oct 2023 17:22:44 -0700 Subject: [PATCH 29/62] Update Chat Actor test to not incept with prior world knowledge --- tests/test_gpt4all_chat_actors.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tests/test_gpt4all_chat_actors.py b/tests/test_gpt4all_chat_actors.py index 056618be..76ed26e7 100644 --- a/tests/test_gpt4all_chat_actors.py +++ b/tests/test_gpt4all_chat_actors.py @@ -145,7 +145,7 @@ def 
test_extract_multiple_implicit_questions_from_message(loaded_model): def test_generate_search_query_using_question_from_chat_history(loaded_model): # Arrange message_list = [ - ("What is the name of Mr. Vader's daughter?", "Princess Leia", []), + ("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []), ] # Act @@ -156,17 +156,22 @@ def test_generate_search_query_using_question_from_chat_history(loaded_model): use_history=True, ) - expected_responses = [ - "Vader", - "sons", + all_expected_in_response = [ + "Anderson", + ] + + any_expected_in_response = [ "son", - "Darth", + "sons", "children", ] # Assert assert len(response) >= 1 - assert any([expected_response in response[0] for expected_response in expected_responses]), ( + assert all([expected_response in response[0] for expected_response in all_expected_in_response]), ( + "Expected chat actor to ask for clarification in response, but got: " + response[0] + ) + assert any([expected_response in response[0] for expected_response in any_expected_in_response]), ( "Expected chat actor to ask for clarification in response, but got: " + response[0] ) @@ -176,20 +181,20 @@ def test_generate_search_query_using_question_from_chat_history(loaded_model): def test_generate_search_query_using_answer_from_chat_history(loaded_model): # Arrange message_list = [ - ("What is the name of Mr. Vader's daughter?", "Princess Leia", []), + ("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []), ] # Act response = extract_questions_offline( - "Is she a Jedi?", + "Is she a Doctor?", conversation_log=populate_chat_history(message_list), loaded_model=loaded_model, use_history=True, ) expected_responses = [ - "Leia", - "Vader", + "Barbara", + "Robert", "daughter", ] From 90e1d9e3d685f4f6c54835f5092c88c6a252b61e Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 16 Oct 2023 10:57:16 -0700 Subject: [PATCH 30/62] Pin gpt4all to 1.0.12 as next version will introduce breaking changes --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e6773b88..a52fc9b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,8 +59,8 @@ dependencies = [ "bs4 >= 0.0.1", "anyio == 3.7.1", "pymupdf >= 1.23.3", - "gpt4all >= 1.0.12; platform_system == 'Linux' and platform_machine == 'x86_64'", - "gpt4all >= 1.0.12; platform_system == 'Windows' or platform_system == 'Darwin'", + "gpt4all == 1.0.12; platform_system == 'Linux' and platform_machine == 'x86_64'", + "gpt4all == 1.0.12; platform_system == 'Windows' or platform_system == 'Darwin'", ] dynamic = ["version"] From 644c3b787f12bbc2d3f4814bd4afc5fd82c9e099 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 16 Oct 2023 11:15:38 -0700 Subject: [PATCH 31/62] Scale no. of chat history messages to use as context with max_prompt_size Previously lookback turns was set to a static 2. But now that we support more chat models, their prompt size vary considerably. Make lookback_turns proportional to max_prompt_size. 
The truncate_messages can remove messages if they exceed max_prompt_size later This lets Khoj pass more of the chat history as context for models with larger context window --- src/khoj/processor/conversation/utils.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index 5f219b83..83d51f2d 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -3,6 +3,7 @@ import logging from time import perf_counter import json from datetime import datetime +import queue import tiktoken # External packages @@ -10,7 +11,6 @@ from langchain.schema import ChatMessage from transformers import AutoTokenizer # Internal Packages -import queue from khoj.utils.helpers import merge_dicts @@ -89,11 +89,22 @@ def generate_chatml_messages_with_context( system_message, conversation_log={}, model_name="gpt-3.5-turbo", - lookback_turns=2, max_prompt_size=None, tokenizer_name=None, ): """Generate messages for ChatGPT with context from previous conversation""" + # Set max prompt size from user config, pre-configured for model or to default prompt size + try: + max_prompt_size = max_prompt_size or model_to_prompt_size[model_name] + except: + max_prompt_size = 2000 + logger.warning( + f"Fallback to default prompt size: {max_prompt_size}.\nConfigure max_prompt_size for unsupported model: {model_name} in Khoj settings to longer context window." + ) + + # Scale lookback turns proportional to max prompt size supported by model + lookback_turns = max_prompt_size // 750 + # Extract Chat History for Context chat_logs = [] for chat in conversation_log.get("chat", []): @@ -113,15 +124,6 @@ def generate_chatml_messages_with_context( messages = user_chatml_message + rest_backnforths + system_chatml_message - # Set max prompt size from user config, pre-configured for model or to default prompt size - try: - max_prompt_size = max_prompt_size or model_to_prompt_size[model_name] - except: - max_prompt_size = 2000 - logger.warning( - f"Fallback to default prompt size: {max_prompt_size}.\nConfigure max_prompt_size for unsupported model: {model_name} in Khoj settings to longer context window." - ) - # Truncate oldest messages from conversation history until under max supported prompt size by model messages = truncate_messages(messages, max_prompt_size, model_name, tokenizer_name) From f64fa06e2278a6ea64d1054163842d2001661e8d Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 13 Oct 2023 18:48:26 -0700 Subject: [PATCH 32/62] Initialize the Khoj Transient menu on first run instead of load This prevents Khoj from polling the Khoj server until explicitly invoked via `khoj' entrypoint function. Previously it'd make a request to the khoj server every time Emacs or khoj.el was loaded Closes #243 --- src/interface/emacs/khoj.el | 92 ++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 43 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 1e7f9032..f8389874 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -1092,17 +1092,20 @@ Paragraph only starts at first text after blank line." 
;; Khoj Menu ;; --------- -(transient-define-argument khoj--content-type-switch () - :class 'transient-switches - :argument-format "--content-type=%s" - :argument-regexp ".+" - ;; set content type to: last used > based on current buffer > default type - :init-value (lambda (obj) (oset obj value (format "--content-type=%s" (or khoj--content-type (khoj--buffer-name-to-content-type (buffer-name)))))) - ;; dynamically set choices to content types enabled on khoj backend - :choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("all" "org" "markdown" "pdf" "image"))) +(defun khoj--setup-and-show-menu () + "Create Transient menu for khoj and show it." + ;; Create the Khoj Transient menu + (transient-define-argument khoj--content-type-switch () + :class 'transient-switches + :argument-format "--content-type=%s" + :argument-regexp ".+" + ;; set content type to: last used > based on current buffer > default type + :init-value (lambda (obj) (oset obj value (format "--content-type=%s" (or khoj--content-type (khoj--buffer-name-to-content-type (buffer-name)))))) + ;; dynamically set choices to content types enabled on khoj backend + :choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("all" "org" "markdown" "pdf" "image"))) -(transient-define-suffix khoj--search-command (&optional args) - (interactive (list (transient-args transient-current-command))) + (transient-define-suffix khoj--search-command (&optional args) + (interactive (list (transient-args transient-current-command))) (progn ;; set content type to: specified > last used > based on current buffer > default type (setq khoj--content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name)))) @@ -1111,9 +1114,9 @@ Paragraph only starts at first text after blank line." ;; trigger incremental search (call-interactively #'khoj-incremental))) -(transient-define-suffix khoj--find-similar-command (&optional args) - "Find items similar to current item at point." - (interactive (list (transient-args transient-current-command))) + (transient-define-suffix khoj--find-similar-command (&optional args) + "Find items similar to current item at point." + (interactive (list (transient-args transient-current-command))) (progn ;; set content type to: specified > last used > based on current buffer > default type (setq khoj--content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name)))) @@ -1121,37 +1124,40 @@ Paragraph only starts at first text after blank line." (setq khoj-results-count (or (transient-arg-value "--results-count=" args) khoj-results-count)) (khoj--find-similar khoj--content-type))) -(transient-define-suffix khoj--update-command (&optional args) - "Call khoj API to update index of specified content type." - (interactive (list (transient-args transient-current-command))) - (let* ((force-update (if (member "--force-update" args) "true" "false")) - ;; set content type to: specified > last used > based on current buffer > default type - (content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name)))) - (type-query (if (equal content-type "all") "" (format "t=%s" content-type))) - (update-url (format "%s/api/update?%s&force=%s&client=emacs" khoj-server-url type-query force-update)) - (url-request-method "GET")) - (progn - (setq khoj--content-type content-type) - (url-retrieve update-url (lambda (_) (message "khoj.el: %s index %supdated!" 
content-type (if (member "--force-update" args) "force " ""))))))) + (transient-define-suffix khoj--update-command (&optional args) + "Call khoj API to update index of specified content type." + (interactive (list (transient-args transient-current-command))) + (let* ((force-update (if (member "--force-update" args) "true" "false")) + ;; set content type to: specified > last used > based on current buffer > default type + (content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name)))) + (type-query (if (equal content-type "all") "" (format "t=%s" content-type))) + (update-url (format "%s/api/update?%s&force=%s&client=emacs" khoj-server-url type-query force-update)) + (url-request-method "GET")) + (progn + (setq khoj--content-type content-type) + (url-retrieve update-url (lambda (_) (message "khoj.el: %s index %supdated!" content-type (if (member "--force-update" args) "force " ""))))))) -(transient-define-suffix khoj--chat-command (&optional _) - "Command to Chat with Khoj." - (interactive (list (transient-args transient-current-command))) - (khoj--chat)) + (transient-define-suffix khoj--chat-command (&optional _) + "Command to Chat with Khoj." + (interactive (list (transient-args transient-current-command))) + (khoj--chat)) -(transient-define-prefix khoj--menu () - "Create Khoj Menu to Configure and Execute Commands." - [["Configure Search" - ("n" "Results Count" "--results-count=" :init-value (lambda (obj) (oset obj value (format "%s" khoj-results-count)))) - ("t" "Content Type" khoj--content-type-switch)] - ["Configure Update" - ("-f" "Force Update" "--force-update")]] - [["Act" - ("c" "Chat" khoj--chat-command) - ("s" "Search" khoj--search-command) - ("f" "Find Similar" khoj--find-similar-command) - ("u" "Update" khoj--update-command) - ("q" "Quit" transient-quit-one)]]) + (transient-define-prefix khoj--menu () + "Create Khoj Menu to Configure and Execute Commands." + [["Configure Search" + ("n" "Results Count" "--results-count=" :init-value (lambda (obj) (oset obj value (format "%s" khoj-results-count)))) + ("t" "Content Type" khoj--content-type-switch)] + ["Configure Update" + ("-f" "Force Update" "--force-update")]] + [["Act" + ("c" "Chat" khoj--chat-command) + ("s" "Search" khoj--search-command) + ("f" "Find Similar" khoj--find-similar-command) + ("u" "Update" khoj--update-command) + ("q" "Quit" transient-quit-one)]]) + + ;; Show the Khoj Transient menu + (khoj--menu)) ;; ---------- @@ -1164,7 +1170,7 @@ Paragraph only starts at first text after blank line." (interactive) (when khoj-auto-setup (khoj-setup t)) - (khoj--menu)) + (khoj--setup-and-show-menu)) (provide 'khoj) From 5dc399b32e676c7a2049cab53d4a608b4eb0158b Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 16 Oct 2023 19:39:06 -0700 Subject: [PATCH 33/62] Document system requirements to run offline chat Closes #375 --- docs/chat.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/chat.md b/docs/chat.md index eb3a2f0f..eeca3132 100644 --- a/docs/chat.md +++ b/docs/chat.md @@ -7,18 +7,21 @@ ### Setup #### Offline Chat -Offline chat works without internet but it is slower, lower quality and more compute intensive. +Offline chat stays completely private and works without internet. But it is slower, lower quality and more compute intensive. 
-!> **Warning**: This will download a 3Gb+ Llama v2 chat model which can take some time +> **System Requirements**: +> - You need at least **16 GB of RAM** and **4 GB of Disk** +> - A CPU supporting [AVX or AVX2 instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) is required +> - A Mac M1+ or [Vulcan supported GPU](https://vulkan.gpuinfo.org/) should significantly speed up chat response times -- Open your [Khoj settings](http://localhost:42110/config/), click *Enable* on the Offline Chat card +- Open your [Khoj settings](http://localhost:42110/config/) and click *Enable* on the Offline Chat card ![Configure offline chat](https://user-images.githubusercontent.com/6413477/257021364-8a2029f5-dc21-4de8-9af9-9ba6100d695c.mp4 ':include :type=mp4') #### Online Chat Online chat requires internet to use ChatGPT but is faster, higher quality and less compute intensive. -!> **Warning**: This will enable Khoj to send your chat queries and notes to OpenAI for processing +!> **Warning**: This will enable Khoj to send your chat queries and query relevant notes to OpenAI for processing 1. Get your [OpenAI API Key](https://platform.openai.com/account/api-keys) 2. Open your [Khoj Online Chat settings](http://localhost:42110/config/processor/conversation), add your OpenAI API key, and click *Save*. Then go to your [Khoj settings](http://localhost:42110/config) and click `Configure`. This will refresh Khoj with your OpenAI API key. From 79b3f8273afb09a7ba0b9322173d29d43e377289 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 16 Oct 2023 23:53:02 -0700 Subject: [PATCH 34/62] Make khoj.el send files to be deleted from index to server --- src/interface/emacs/khoj.el | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index f8389874..2956c025 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -136,6 +136,9 @@ (defvar khoj--index-timer nil "Timer to trigger content indexing.") +(defvar khoj--indexed-files '() + "Files that were indexed in previous content indexing run.") + (declare-function org-element-property "org-mode" (PROPERTY ELEMENT)) (declare-function org-element-type "org-mode" (ELEMENT)) (declare-function markdown-mode "markdown-mode" ()) @@ -543,7 +546,7 @@ CONFIG is json obtained from Khoj config API." (inhibit-message t) (message-log-max nil)) (let ((url-request-method "POST") - (url-request-data (khoj--render-files-as-request-body files-to-index boundary)) + (url-request-data (khoj--render-files-as-request-body files-to-index khoj--indexed-files boundary)) (url-request-extra-headers `(("content-type" . ,(format "multipart/form-data; boundary=%s" boundary)) ("x-api-key" . ,khoj-server-api-key)))) (with-current-buffer @@ -555,11 +558,12 @@ CONFIG is json obtained from Khoj config API." (with-current-buffer (current-buffer) (goto-char "\n\n") (message "khoj.el: Failed to update Content Index. Status: %s. Response: %s" status (string-trim (buffer-substring-no-properties (point) (point-max))))))) - nil t t))))) + nil t t))) + (setq khoj--indexed-files files-to-index))) -(defun khoj--render-files-as-request-body (files-to-index boundary) - "Render `FILES-TO-INDEX' as multi-part form body using `BOUNDARY'. -This is sent to Khoj server as a POST request." +(defun khoj--render-files-as-request-body (files-to-index previously-indexed-files boundary) + "Render `FILES-TO-INDEX', `PREVIOUSLY-INDEXED-FILES' as multi-part form body. 
+Use `BOUNDARY' to separate files. This is sent to Khoj server as a POST request." (with-temp-buffer (set-buffer-multibyte nil) (insert "\n") @@ -571,6 +575,13 @@ This is sent to Khoj server as a POST request." (insert-file-contents-literally file-to-index) (buffer-string))) (insert "\r\n")) + (dolist (file-to-index previously-indexed-files) + (when (not (member file-to-index files-to-index)) + (insert (format "--%s\r\n" boundary)) + (insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-to-index)) + (insert "Content-Type: text/org\r\n\r\n") + (insert "") + (insert "\r\n"))) (insert (format "--%s--\r\n" boundary)) (buffer-string))) From 6baaaaf91a76a28667a223cc6c2fec3399bd554e Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 16 Oct 2023 23:54:32 -0700 Subject: [PATCH 35/62] Test request body of multi-part form to update content index from khoj.el --- src/interface/emacs/tests/khoj-tests.el | 58 +++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/src/interface/emacs/tests/khoj-tests.el b/src/interface/emacs/tests/khoj-tests.el index 8242d30b..c0d9f4a6 100644 --- a/src/interface/emacs/tests/khoj-tests.el +++ b/src/interface/emacs/tests/khoj-tests.el @@ -206,6 +206,64 @@ Rule everything\n") "Rule everything")) )) + +;; ------------------------------------- +;; Test Helpers to Index Content +;; ------------------------------------- + +(ert-deftest khoj-tests--render-files-to-add-request-body () + "Test files are formatted into a multi-part http request body" + (let ((upgrade-file (make-temp-file "upgrade" nil ".org" "# Become God\n## Upgrade\n\nPenance to Immortality\n\n")) + (act-file (make-temp-file "act" nil ".org" "## Act\n\nRule everything\n\n"))) + (unwind-protect + (progn + (should + (equal + (khoj--render-files-as-request-body (list upgrade-file act-file) '() "khoj") + (format + "\n--khoj\r\n\ +Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\ +Content-Type: text/org\r\n\r\n\ +# Become God\n\ +## Upgrade\n\n\ +Penance to Immortality\n\n\r +--khoj\r\n\ +Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\ +Content-Type: text/org\r\n\r\n\ +## Act\n\n\ +Rule everything\n\n\r\n\ +--khoj--\r\n" upgrade-file act-file)))) + (delete-file upgrade-file) + (delete-file act-file)))) + +(ert-deftest khoj-tests--render-files-to-add-delete-in-request-body () + "Test files are formatted into a multi-part http request body" + (let ((upgrade-file (make-temp-file "upgrade" nil ".org" "# Become God\n## Upgrade\n\nPenance to Immortality\n\n")) + (act-file (make-temp-file "act" nil ".org" "## Act\n\nRule everything\n\n"))) + (unwind-protect + (progn + (should + (equal + (khoj--render-files-as-request-body (list upgrade-file act-file) (list upgrade-file act-file "/tmp/deleted-file.org") "khoj") + (format + "\n--khoj\r\n\ +Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\ +Content-Type: text/org\r\n\r\n\ +# Become God\n\ +## Upgrade\n\n\ +Penance to Immortality\n\n\r +--khoj\r\n\ +Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\ +Content-Type: text/org\r\n\r\n\ +## Act\n\n\ +Rule everything\n\n\r +--khoj\r\n\ +Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\ +Content-Type: text/org\r\n\r\n\ +\r +--khoj--\r\n" upgrade-file act-file "/tmp/deleted-file.org")))) + (delete-file upgrade-file) + (delete-file act-file)))) (provide 'khoj-tests) From f2e293a14905cbdd6af5d668ec5433c46acd4f2a Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 
17 Oct 2023 02:17:44 -0700 Subject: [PATCH 36/62] Push Vault files to index to Khoj server using Khoj Obsidian plugin Use the multi-part/form-data request to sync Markdown, PDF files in vault to index on khoj server Run scheduled job to push updates to value for indexing every 1 hour --- src/interface/obsidian/src/main.ts | 20 +++++++++-- src/interface/obsidian/src/utils.ts | 54 ++++++++++++++++++++++++++++- 2 files changed, 71 insertions(+), 3 deletions(-) diff --git a/src/interface/obsidian/src/main.ts b/src/interface/obsidian/src/main.ts index 935945dd..65dac069 100644 --- a/src/interface/obsidian/src/main.ts +++ b/src/interface/obsidian/src/main.ts @@ -1,12 +1,13 @@ -import { Notice, Plugin } from 'obsidian'; +import { Notice, Plugin, TFile } from 'obsidian'; import { KhojSetting, KhojSettingTab, DEFAULT_SETTINGS } from 'src/settings' import { KhojSearchModal } from 'src/search_modal' import { KhojChatModal } from 'src/chat_modal' -import { configureKhojBackend } from './utils'; +import { configureKhojBackend, updateContentIndex } from './utils'; export default class Khoj extends Plugin { settings: KhojSetting; + indexingTimer: NodeJS.Timeout; async onload() { await this.loadSettings(); @@ -54,6 +55,13 @@ export default class Khoj extends Plugin { // Add a settings tab so the user can configure khoj this.addSettingTab(new KhojSettingTab(this.app, this)); + + // Add scheduled job to update index every 60 minutes + this.indexingTimer = setInterval(async () => { + if (this.settings.autoConfigure) { + this.lastSyncedFiles = await updateContentIndex(this.app.vault, this.settings); + } + }, 60 * 60 * 1000); } async loadSettings() { @@ -72,4 +80,12 @@ export default class Khoj extends Plugin { } this.saveData(this.settings); } + + async onunload() { + // Remove scheduled job to update index at regular cadence + if (this.indexingTimer) + clearInterval(this.indexingTimer); + + this.unload(); + } } diff --git a/src/interface/obsidian/src/utils.ts b/src/interface/obsidian/src/utils.ts index 920da583..1707703a 100644 --- a/src/interface/obsidian/src/utils.ts +++ b/src/interface/obsidian/src/utils.ts @@ -1,4 +1,4 @@ -import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault, Modal } from 'obsidian'; +import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault, Modal, TFile } from 'obsidian'; import { KhojSetting } from 'src/settings' export function getVaultAbsolutePath(vault: Vault): string { @@ -22,6 +22,58 @@ interface ProcessorData { }; } +function fileExtensionToMimeType (extension: string): string { + switch (extension) { + case 'pdf': + return 'application/pdf'; + case 'png': + return 'image/png'; + case 'jpg': + case 'jpeg': + return 'image/jpeg'; + case 'md': + case 'markdown': + return 'text/markdown'; + case 'org': + return 'text/org'; + default: + return 'text/plain'; + } +} + +export async function updateContentIndex(vault: Vault, setting: KhojSetting): Promise { + // Get all markdown, pdf files in the vault + console.log(`Khoj: Updating Khoj content index...`) + const files = vault.getFiles().filter(file => file.extension === 'md' || file.extension === 'pdf'); + const binaryFileTypes = ['pdf', 'png', 'jpg', 'jpeg'] + + // Create multipart form data with all markdown, pdf files + const formData = new FormData(); + for (const file of files) { + const encoding = binaryFileTypes.includes(file.extension) ? "binary" : "utf8"; + const mimeType = fileExtensionToMimeType(file.extension) + (encoding === "utf8" ? 
"; charset=UTF-8" : ""); + const fileContent = await vault.read(file); + formData.append('files', new Blob([fileContent], { type: mimeType }), file.path); + } + + // Call Khoj backend to update index with all markdown, pdf files + const response = await fetch(`${setting.khojUrl}/api/v1/indexer/batch`, { + method: 'POST', + headers: { + 'x-api-key': 'secret', + }, + body: formData, + }); + + if (!response.ok) { + new Notice(`❗️Failed to update Khoj content index. Ensure Khoj server connected or raise issue on Khoj Discord/Github\nError: ${response.statusText}`); + } else { + console.log(`✅ Refreshed Khoj content index.`); + } + + return files; +} + export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) { let vaultPath = getVaultAbsolutePath(vault); let mdInVault = `${vaultPath}/**/*.md`; From 8e627a5809e2f996f5bbf6c7c37a4e7091a3fd0a Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 02:51:54 -0700 Subject: [PATCH 37/62] Pass any files to be deleted to indexer API via Khoj Obsidian plugin - Keep state of previously synced files to identify files to be deleted - Last synced files stored in settings for persistence of this data across Obsidian reboots --- src/interface/obsidian/src/main.ts | 4 +++- src/interface/obsidian/src/settings.ts | 4 +++- src/interface/obsidian/src/utils.ts | 17 ++++++++++++++--- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/interface/obsidian/src/main.ts b/src/interface/obsidian/src/main.ts index 65dac069..1fbed55f 100644 --- a/src/interface/obsidian/src/main.ts +++ b/src/interface/obsidian/src/main.ts @@ -59,7 +59,9 @@ export default class Khoj extends Plugin { // Add scheduled job to update index every 60 minutes this.indexingTimer = setInterval(async () => { if (this.settings.autoConfigure) { - this.lastSyncedFiles = await updateContentIndex(this.app.vault, this.settings); + this.settings.lastSyncedFiles = await updateContentIndex( + this.app.vault, this.settings, this.settings.lastSyncedFiles + ); } }, 60 * 60 * 1000); } diff --git a/src/interface/obsidian/src/settings.ts b/src/interface/obsidian/src/settings.ts index c013f10c..dfb6e6bb 100644 --- a/src/interface/obsidian/src/settings.ts +++ b/src/interface/obsidian/src/settings.ts @@ -1,4 +1,4 @@ -import { App, Notice, PluginSettingTab, request, Setting } from 'obsidian'; +import { App, Notice, PluginSettingTab, request, Setting, TFile } from 'obsidian'; import Khoj from 'src/main'; export interface KhojSetting { @@ -8,6 +8,7 @@ export interface KhojSetting { khojUrl: string; connectedToBackend: boolean; autoConfigure: boolean; + lastSyncedFiles: TFile[]; } export const DEFAULT_SETTINGS: KhojSetting = { @@ -17,6 +18,7 @@ export const DEFAULT_SETTINGS: KhojSetting = { connectedToBackend: false, autoConfigure: true, openaiApiKey: '', + lastSyncedFiles: [] } export class KhojSettingTab extends PluginSettingTab { diff --git a/src/interface/obsidian/src/utils.ts b/src/interface/obsidian/src/utils.ts index 1707703a..9dba9fb9 100644 --- a/src/interface/obsidian/src/utils.ts +++ b/src/interface/obsidian/src/utils.ts @@ -41,21 +41,32 @@ function fileExtensionToMimeType (extension: string): string { } } -export async function updateContentIndex(vault: Vault, setting: KhojSetting): Promise { +export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSyncedFiles: TFile[]): Promise { // Get all markdown, pdf files in the vault console.log(`Khoj: Updating Khoj content index...`) const files = 
vault.getFiles().filter(file => file.extension === 'md' || file.extension === 'pdf'); const binaryFileTypes = ['pdf', 'png', 'jpg', 'jpeg'] + let countOfFilesToIndex = 0; + let countOfFilesToDelete = 0; - // Create multipart form data with all markdown, pdf files + // Add all files to index as multipart form data const formData = new FormData(); for (const file of files) { + countOfFilesToIndex++; const encoding = binaryFileTypes.includes(file.extension) ? "binary" : "utf8"; const mimeType = fileExtensionToMimeType(file.extension) + (encoding === "utf8" ? "; charset=UTF-8" : ""); const fileContent = await vault.read(file); formData.append('files', new Blob([fileContent], { type: mimeType }), file.path); } + // Add any previously synced files to be deleted to multipart form data + for (const lastSyncedFile of lastSyncedFiles) { + if (!files.includes(lastSyncedFile)) { + countOfFilesToDelete++; + formData.append('files', new Blob([]), lastSyncedFile.path); + } + } + // Call Khoj backend to update index with all markdown, pdf files const response = await fetch(`${setting.khojUrl}/api/v1/indexer/batch`, { method: 'POST', @@ -68,7 +79,7 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting): Pr if (!response.ok) { new Notice(`❗️Failed to update Khoj content index. Ensure Khoj server connected or raise issue on Khoj Discord/Github\nError: ${response.statusText}`); } else { - console.log(`✅ Refreshed Khoj content index.`); + console.log(`✅ Refreshed Khoj content index. Updated: ${countOfFilesToIndex} files, Deleted: ${countOfFilesToDelete} files.`); } return files; From d27dc71dfecf3f395a7200e7622ed6b7054543fc Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 02:37:20 -0700 Subject: [PATCH 38/62] Use encoding of each file set in indexer request to read file Get encoding type from multi-part/form-request body for each file Read text files as utf-8 and pdfs, images as binary --- src/interface/desktop/main.js | 2 +- src/khoj/routers/indexer.py | 6 ++++-- src/khoj/utils/helpers.py | 17 +++++++++-------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/interface/desktop/main.js b/src/interface/desktop/main.js index 62493f54..17ab2fb4 100644 --- a/src/interface/desktop/main.js +++ b/src/interface/desktop/main.js @@ -93,9 +93,9 @@ function filenameToMimeType (filename) { case 'png': return 'image/png'; case 'jpg': - return 'image/jpeg'; case 'jpeg': return 'image/jpeg'; + case 'md': case 'markdown': return 'text/markdown'; case 'org': diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py index 86cd847f..d94b8330 100644 --- a/src/khoj/routers/indexer.py +++ b/src/khoj/routers/indexer.py @@ -73,7 +73,7 @@ async def index_batch( plaintext_files: Dict[str, str] = {} for file in files: - file_type = get_file_type(file.content_type) + file_type, encoding = get_file_type(file.content_type) dict_to_update = None if file_type == "org": dict_to_update = org_files @@ -85,7 +85,9 @@ async def index_batch( dict_to_update = plaintext_files if dict_to_update is not None: - dict_to_update[file.filename] = file.file.read().decode("utf-8") + dict_to_update[file.filename] = ( + file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read() + ) else: logger.warning(f"Skipped indexing unsupported file type sent by client: {file.filename}") diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index 3391a55d..9209ff67 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -66,24 +66,25 @@ def 
merge_dicts(priority_dict: dict, default_dict: dict): return merged_dict -def get_file_type(file_type: str) -> str: +def get_file_type(file_type: str) -> tuple[str, str]: "Get file type from file mime type" + encoding = file_type.split("=")[1].strip().lower() if ";" in file_type else None file_type = file_type.split(";")[0].strip() if ";" in file_type else file_type if file_type in ["text/markdown"]: - return "markdown" + return "markdown", encoding elif file_type in ["text/org"]: - return "org" + return "org", encoding elif file_type in ["application/pdf"]: - return "pdf" + return "pdf", encoding elif file_type in ["image/jpeg"]: - return "jpeg" + return "jpeg", encoding elif file_type in ["image/png"]: - return "png" + return "png", encoding elif file_type in ["text/plain", "text/html", "application/xml", "text/x-rst"]: - return "plaintext" + return "plaintext", encoding else: - return "other" + return "other", encoding def load_model( From 541cd59a49ce841b696c5c4900c0fd1e96709007 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 02:41:16 -0700 Subject: [PATCH 39/62] Let fs_syncer pass PDF files directly as binary before indexing No need to do unneeded base64 encoding/decoding to pass pdf contents for indexing from fs_syncer to pdf_to_jsonl --- src/khoj/processor/pdf/pdf_to_jsonl.py | 2 +- src/khoj/utils/fs_syncer.py | 2 +- tests/test_pdf_to_jsonl.py | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/khoj/processor/pdf/pdf_to_jsonl.py b/src/khoj/processor/pdf/pdf_to_jsonl.py index 77c34617..c24d9940 100644 --- a/src/khoj/processor/pdf/pdf_to_jsonl.py +++ b/src/khoj/processor/pdf/pdf_to_jsonl.py @@ -65,7 +65,7 @@ class PdfToJsonl(TextToJsonl): # Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PyPDFLoader expects a file path tmp_file = f"tmp_pdf_file.pdf" with open(f"{tmp_file}", "wb") as f: - bytes = base64.b64decode(pdf_files[pdf_file]) + bytes = pdf_files[pdf_file] f.write(bytes) loader = PyMuPDFLoader(f"{tmp_file}") pdf_entries_per_file = [page.page_content for page in loader.load()] diff --git a/src/khoj/utils/fs_syncer.py b/src/khoj/utils/fs_syncer.py index d303d39b..4fab6d81 100644 --- a/src/khoj/utils/fs_syncer.py +++ b/src/khoj/utils/fs_syncer.py @@ -210,7 +210,7 @@ def get_pdf_files(config: TextContentConfig): for file in all_pdf_files: with open(file, "rb") as f: try: - filename_to_content_map[file] = base64.b64encode(f.read()).decode("utf-8") + filename_to_content_map[file] = f.read() except Exception as e: logger.warning(f"Unable to read file: {file} as PDF. 
Skipping file.") logger.warning(e, exc_info=True) diff --git a/tests/test_pdf_to_jsonl.py b/tests/test_pdf_to_jsonl.py index bacce37c..b9b26986 100644 --- a/tests/test_pdf_to_jsonl.py +++ b/tests/test_pdf_to_jsonl.py @@ -1,7 +1,6 @@ # Standard Packages import json import os -import base64 # Internal Packages from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl @@ -16,7 +15,7 @@ def test_single_page_pdf_to_jsonl(): # Extract Entries from specified Pdf files # Read singlepage.pdf into memory as bytes with open("tests/data/pdf/singlepage.pdf", "rb") as f: - pdf_bytes = base64.b64encode(f.read()).decode("utf-8") + pdf_bytes = f.read() data = {"tests/data/pdf/singlepage.pdf": pdf_bytes} entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data) @@ -36,7 +35,7 @@ def test_multi_page_pdf_to_jsonl(): # Act # Extract Entries from specified Pdf files with open("tests/data/pdf/multipage.pdf", "rb") as f: - pdf_bytes = base64.b64encode(f.read()).decode("utf-8") + pdf_bytes = f.read() data = {"tests/data/pdf/multipage.pdf": pdf_bytes} entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data) From 99a2c934a3f98b0ea833ffe20d6d8a8ff820106d Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 02:54:18 -0700 Subject: [PATCH 40/62] Add CORS policy to allow requests from khoj apps, obsidian & localhost Using fetch from Khoj Obsidian plugin was failing due to cross-origin request and method: no-cors didn't allow passing x-api-key custom header. And using Obsidian's request with multi-part/form-data wasn't possible either. --- src/khoj/main.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/khoj/main.py b/src/khoj/main.py index 6710ed05..7b1bfd7e 100644 --- a/src/khoj/main.py +++ b/src/khoj/main.py @@ -20,6 +20,7 @@ warnings.filterwarnings("ignore", message=r"legacy way to download files from th # External Packages import uvicorn from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware from rich.logging import RichHandler import schedule @@ -31,6 +32,15 @@ from khoj.utils.cli import cli # Initialize the Application Server app = FastAPI() +# Add CORS middleware +app.add_middleware( + CORSMiddleware, + allow_origins=["app://obsidian.md", "http://localhost:*", "https://app.khoj.dev/*", "app://khoj.dev"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + # Set Locale locale.setlocale(locale.LC_ALL, "") From 13a3122bf3da89f53c5e7914814df61dc298ce82 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 03:23:25 -0700 Subject: [PATCH 41/62] Stop configuring server to pull files to index from Obsidian client Obsidian client now pushes vault files to index instead --- src/interface/obsidian/src/utils.ts | 104 +--------------------------- 1 file changed, 2 insertions(+), 102 deletions(-) diff --git a/src/interface/obsidian/src/utils.ts b/src/interface/obsidian/src/utils.ts index 9dba9fb9..7fb04d24 100644 --- a/src/interface/obsidian/src/utils.ts +++ b/src/interface/obsidian/src/utils.ts @@ -41,7 +41,7 @@ function fileExtensionToMimeType (extension: string): string { } } -export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSyncedFiles: TFile[]): Promise { +export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSyncedFiles: TFile[], regenerate: boolean = false): Promise { // Get all markdown, pdf files in the vault console.log(`Khoj: Updating Khoj content index...`) const files = vault.getFiles().filter(file => 
file.extension === 'md' || file.extension === 'pdf'); @@ -68,7 +68,7 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las } // Call Khoj backend to update index with all markdown, pdf files - const response = await fetch(`${setting.khojUrl}/api/v1/indexer/batch`, { + const response = await fetch(`${setting.khojUrl}/api/v1/indexer/batch?regenerate=${regenerate}`, { method: 'POST', headers: { 'x-api-key': 'secret', @@ -86,9 +86,6 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las } export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) { - let vaultPath = getVaultAbsolutePath(vault); - let mdInVault = `${vaultPath}/**/*.md`; - let pdfInVault = `${vaultPath}/**/*.pdf`; let khojConfigUrl = `${setting.khojUrl}/api/config/data`; // Check if khoj backend is configured, note if cannot connect to backend @@ -106,11 +103,8 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n if (!setting.connectedToBackend) return; // Set index name from the path of the current vault - let indexName = vaultPath.replace(/\//g, '_').replace(/\\/g, '_').replace(/ /g, '_').replace(/:/g, '_'); // Get default config fields from khoj backend let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response)); - let khojDefaultMdIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]); - let khojDefaultPdfIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["pdf"]["embeddings-file"]); let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]); let khojDefaultChatModelName = defaultConfig["processor"]["conversation"]["openai"]["chat-model"]; @@ -118,99 +112,7 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n await request(khoj_already_configured ? 
khojConfigUrl : `${khojConfigUrl}/default`) .then(response => JSON.parse(response)) .then(data => { - khoj_already_configured = data["content-type"] != null; - // If khoj backend not configured yet - if (!khoj_already_configured) { - // Create khoj content-type config with only markdown configured - data["content-type"] = { - "markdown": { - "input-filter": [mdInVault], - "input-files": null, - "embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`, - } - } - - const hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf'); - - if (hasPdfFiles) { - data["content-type"]["pdf"] = { - "input-filter": [pdfInVault], - "input-files": null, - "embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`, - } - } - } - // Else if khoj config has no markdown content config - else if (!data["content-type"]["markdown"]) { - // Add markdown config to khoj content-type config - // Set markdown config to index markdown files in configured obsidian vault - data["content-type"]["markdown"] = { - "input-filter": [mdInVault], - "input-files": null, - "embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`, - } - } - // Else if khoj is not configured to index markdown files in configured obsidian vault - else if ( - data["content-type"]["markdown"]["input-files"] != null || - data["content-type"]["markdown"]["input-filter"] == null || - data["content-type"]["markdown"]["input-filter"].length != 1 || - data["content-type"]["markdown"]["input-filter"][0] !== mdInVault) { - // Update markdown config in khoj content-type config - // Set markdown config to only index markdown files in configured obsidian vault - let khojMdIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]); - data["content-type"]["markdown"] = { - "input-filter": [mdInVault], - "input-files": null, - "embeddings-file": `${khojMdIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojMdIndexDirectory}/${indexName}.jsonl.gz`, - } - } - - if (khoj_already_configured && !data["content-type"]["pdf"]) { - const hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf'); - - if (hasPdfFiles) { - data["content-type"]["pdf"] = { - "input-filter": [pdfInVault], - "input-files": null, - "embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`, - } - } else { - data["content-type"]["pdf"] = null; - } - } - // Else if khoj is not configured to index pdf files in configured obsidian vault - else if (khoj_already_configured && - ( - data["content-type"]["pdf"]["input-files"] != null || - data["content-type"]["pdf"]["input-filter"] == null || - data["content-type"]["pdf"]["input-filter"].length != 1 || - data["content-type"]["pdf"]["input-filter"][0] !== pdfInVault)) { - - let hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf'); - - if (hasPdfFiles) { - // Update pdf config in khoj content-type config - // Set pdf config to only index pdf files in configured obsidian vault - let khojPdfIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["pdf"]["embeddings-file"]); - data["content-type"]["pdf"] = { - "input-filter": [pdfInVault], - "input-files": null, - "embeddings-file": 
`${khojPdfIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojPdfIndexDirectory}/${indexName}.jsonl.gz`, - } - } else { - data["content-type"]["pdf"] = null; - } - } - let conversationLogFile = data?.["processor"]?.["conversation"]?.["conversation-logfile"] ?? `${khojDefaultChatDirectory}/conversation.json`; - let processorData: ProcessorData = { "conversation": { "conversation-logfile": conversationLogFile, @@ -221,9 +123,7 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n // If the Open AI API Key was configured in the plugin settings if (!!setting.openaiApiKey) { - let openAIChatModel = data?.["processor"]?.["conversation"]?.["openai"]?.["chat-model"] ?? khojDefaultChatModelName; - processorData = { "conversation": { "conversation-logfile": conversationLogFile, From 05be6bd877789515d3f0cb6b6a0331e00399a65c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 03:27:41 -0700 Subject: [PATCH 42/62] Clicking Update Index in Obsidian settings should push files to index Use the indexer/batch API endpoint to regenerate content index rather than the previous pull based content indexing API endpoint --- src/interface/obsidian/src/settings.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/interface/obsidian/src/settings.ts b/src/interface/obsidian/src/settings.ts index dfb6e6bb..9b672659 100644 --- a/src/interface/obsidian/src/settings.ts +++ b/src/interface/obsidian/src/settings.ts @@ -1,5 +1,6 @@ import { App, Notice, PluginSettingTab, request, Setting, TFile } from 'obsidian'; import Khoj from 'src/main'; +import { updateContentIndex } from './utils'; export interface KhojSetting { enableOfflineChat: boolean; @@ -120,8 +121,9 @@ export class KhojSettingTab extends PluginSettingTab { }, 300); this.plugin.registerInterval(progress_indicator); - await request(`${this.plugin.settings.khojUrl}/api/update?t=markdown&force=true&client=obsidian`); - await request(`${this.plugin.settings.khojUrl}/api/update?t=pdf&force=true&client=obsidian`); + this.plugin.settings.lastSyncedFiles = await updateContentIndex( + this.app.vault, this.plugin.settings, this.plugin.settings.lastSyncedFiles, true + ); new Notice('✅ Updated Khoj index.'); // Reset button once index is updated From e347823ff492832081f057af44ec65278c3e90d4 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 04:09:33 -0700 Subject: [PATCH 43/62] Log telemetry for index updates via push to API endpoint --- src/khoj/routers/indexer.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py index d94b8330..215dfe57 100644 --- a/src/khoj/routers/indexer.py +++ b/src/khoj/routers/indexer.py @@ -3,8 +3,9 @@ import logging from typing import Optional, Union, Dict # External Packages -from fastapi import APIRouter, HTTPException, Header, Response, UploadFile +from fastapi import APIRouter, HTTPException, Header, Request, Response, UploadFile from pydantic import BaseModel +from khoj.routers.helpers import update_telemetry_state # Internal Packages from khoj.utils import state, constants @@ -57,10 +58,15 @@ class IndexerInput(BaseModel): @indexer.post("/batch") async def index_batch( + request: Request, files: list[UploadFile], x_api_key: str = Header(None), regenerate: bool = False, search_type: Optional[Union[state.SearchType, str]] = None, + client: Optional[str] = None, + user_agent: Optional[str] = Header(None), + referer: Optional[str] = 
Header(None), + host: Optional[str] = Header(None), ): if x_api_key != "secret": raise HTTPException(status_code=401, detail="Invalid API Key") @@ -135,6 +141,17 @@ async def index_batch( logger.error(f"🚨 Failed to update content index via API: {e}", exc_info=True) finally: state.config_lock.release() + + update_telemetry_state( + request=request, + telemetry_type="api", + api="index/update", + client=client, + user_agent=user_agent, + referer=referer, + host=host, + ) + logger.info("📪 Content index updated via API") return Response(content="OK", status_code=200) From 84654ffc5d31ad7356b296296b5f507f038b5648 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 04:30:27 -0700 Subject: [PATCH 44/62] Update indexer API endpoint URL to index/update from indexer/batch New URL follows action oriented endpoint naming convention used for other Khoj API endpoints Update desktop, obsidian and emacs client to call this new API endpoint --- src/interface/desktop/main.js | 2 +- src/interface/emacs/khoj.el | 2 +- src/interface/obsidian/src/utils.ts | 2 +- src/khoj/configure.py | 2 +- src/khoj/routers/indexer.py | 4 ++-- tests/test_client.py | 8 ++++---- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/interface/desktop/main.js b/src/interface/desktop/main.js index 17ab2fb4..53d98c6c 100644 --- a/src/interface/desktop/main.js +++ b/src/interface/desktop/main.js @@ -163,7 +163,7 @@ function pushDataToKhoj (regenerate = false) { const headers = { 'x-api-key': 'secret' }; - axios.post(`${hostURL}/api/v1/indexer/batch?regenerate=${regenerate}`, formData, { headers }) + axios.post(`${hostURL}/api/v1/index/update?regenerate=${regenerate}`, formData, { headers }) .then(response => { console.log(response.data); const win = BrowserWindow.getAllWindows()[0]; diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 2956c025..e3441a1d 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -550,7 +550,7 @@ CONFIG is json obtained from Khoj config API." (url-request-extra-headers `(("content-type" . ,(format "multipart/form-data; boundary=%s" boundary)) ("x-api-key" . 
,khoj-server-api-key)))) (with-current-buffer - (url-retrieve (format "%s/api/v1/indexer/batch" khoj-server-url) + (url-retrieve (format "%s/api/v1/index/update" khoj-server-url) ;; render response from indexing API endpoint on server (lambda (status) (if (not status) diff --git a/src/interface/obsidian/src/utils.ts b/src/interface/obsidian/src/utils.ts index 7fb04d24..8f004469 100644 --- a/src/interface/obsidian/src/utils.ts +++ b/src/interface/obsidian/src/utils.ts @@ -68,7 +68,7 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las } // Call Khoj backend to update index with all markdown, pdf files - const response = await fetch(`${setting.khojUrl}/api/v1/indexer/batch?regenerate=${regenerate}`, { + const response = await fetch(`${setting.khojUrl}/api/v1/index/update?regenerate=${regenerate}`, { method: 'POST', headers: { 'x-api-key': 'secret', diff --git a/src/khoj/configure.py b/src/khoj/configure.py index 7b2b3ce2..a7f39775 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -103,7 +103,7 @@ def configure_routes(app): app.mount("/static", StaticFiles(directory=constants.web_directory), name="static") app.include_router(api, prefix="/api") app.include_router(api_beta, prefix="/api/beta") - app.include_router(indexer, prefix="/api/v1/indexer") + app.include_router(indexer, prefix="/api/v1/index") app.include_router(web_client) diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py index 215dfe57..644712a5 100644 --- a/src/khoj/routers/indexer.py +++ b/src/khoj/routers/indexer.py @@ -56,8 +56,8 @@ class IndexerInput(BaseModel): plaintext: Optional[dict[str, str]] = None -@indexer.post("/batch") -async def index_batch( +@indexer.post("/update") +async def update( request: Request, files: list[UploadFile], x_api_key: str = Header(None), diff --git a/tests/test_client.py b/tests/test_client.py index 831668f7..d17f20fd 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -60,13 +60,13 @@ def test_regenerate_with_invalid_content_type(client): # ---------------------------------------------------------------------------------------------------- -def test_index_batch(client): +def test_index_update(client): # Arrange files = get_sample_files_data() headers = {"x-api-key": "secret"} # Act - response = client.post("/api/v1/indexer/batch", files=files, headers=headers) + response = client.post("/api/v1/index/update", files=files, headers=headers) # Assert assert response.status_code == 200 @@ -80,7 +80,7 @@ def test_regenerate_with_valid_content_type(client): headers = {"x-api-key": "secret"} # Act - response = client.post(f"/api/v1/indexer/batch?search_type={content_type}", files=files, headers=headers) + response = client.post(f"/api/v1/index/update?search_type={content_type}", files=files, headers=headers) # Assert assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}" @@ -95,7 +95,7 @@ def test_regenerate_with_github_fails_without_pat(client): headers = {"x-api-key": "secret"} # Act - response = client.post(f"/api/v1/indexer/batch?search_type=github", files=files, headers=headers) + response = client.post(f"/api/v1/index/update?search_type=github", files=files, headers=headers) # Assert assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github" From 5efae1ad559fd4ffde4b10285eed429bd4e7da87 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 04:42:04 -0700 Subject: [PATCH 45/62] Update indexer API 
endpoint query params for force, content type New URL query params, `force' and `t' match name of query parameter in existing Khoj API endpoints Update Desktop, Obsidian and Emacs client to call using these new API query params. Set `client' query param from each client for telemetry visibility --- src/interface/desktop/main.js | 2 +- src/interface/emacs/khoj.el | 20 ++++++++++++-------- src/interface/obsidian/src/utils.ts | 2 +- src/khoj/routers/indexer.py | 8 ++++---- tests/test_client.py | 4 ++-- 5 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/interface/desktop/main.js b/src/interface/desktop/main.js index 53d98c6c..9b2ee49c 100644 --- a/src/interface/desktop/main.js +++ b/src/interface/desktop/main.js @@ -163,7 +163,7 @@ function pushDataToKhoj (regenerate = false) { const headers = { 'x-api-key': 'secret' }; - axios.post(`${hostURL}/api/v1/index/update?regenerate=${regenerate}`, formData, { headers }) + axios.post(`${hostURL}/api/v1/index/update?force=${regenerate}&client=desktop`, formData, { headers }) .then(response => { console.log(response.data); const win = BrowserWindow.getAllWindows()[0]; diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index e3441a1d..e327bb82 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -537,12 +537,14 @@ CONFIG is json obtained from Khoj config API." ;; Khoj Index Content ;; ------------------- -(defun khoj--server-index-files (&optional file-paths) - "Send files at `FILE-PATHS' to the Khoj server to index for search and chat." +(defun khoj--server-index-files (&optional force content-type file-paths) + "Send files at `FILE-PATHS' to the Khoj server to index for search and chat. +`FORCE' re-indexes all files of `CONTENT-TYPE' even if they are already indexed." (interactive) (let ((boundary (format "-------------------------%d" (random (expt 10 10)))) (files-to-index (or file-paths (append (mapcan (lambda (dir) (directory-files-recursively dir "\\.org$")) khoj-org-directories) khoj-org-files))) + (type-query (if (or (equal content-type "all") (not content-type)) "" (format "t=%s" content-type))) (inhibit-message t) (message-log-max nil)) (let ((url-request-method "POST") @@ -550,14 +552,18 @@ CONFIG is json obtained from Khoj config API." (url-request-extra-headers `(("content-type" . ,(format "multipart/form-data; boundary=%s" boundary)) ("x-api-key" . ,khoj-server-api-key)))) (with-current-buffer - (url-retrieve (format "%s/api/v1/index/update" khoj-server-url) + (url-retrieve (format "%s/api/v1/index/update?%s&force=%s&client=emacs" khoj-server-url type-query (or force "false")) ;; render response from indexing API endpoint on server (lambda (status) (if (not status) - (message "khoj.el: Updated Content Index") + (message "khoj.el: %scontent index %supdated" (if content-type (format "%s " content-type) "") (if force "force " "")) (with-current-buffer (current-buffer) (goto-char "\n\n") - (message "khoj.el: Failed to update Content Index. Status: %s. Response: %s" status (string-trim (buffer-substring-no-properties (point) (point-max))))))) + (message "khoj.el: Failed to %supdate %s content index. Status: %s. Response: %s" + (if force "force " "") + content-type + status + (string-trim (buffer-substring-no-properties (point) (point-max))))))) nil t t))) (setq khoj--indexed-files files-to-index))) @@ -1141,12 +1147,10 @@ Paragraph only starts at first text after blank line." 
(let* ((force-update (if (member "--force-update" args) "true" "false")) ;; set content type to: specified > last used > based on current buffer > default type (content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name)))) - (type-query (if (equal content-type "all") "" (format "t=%s" content-type))) - (update-url (format "%s/api/update?%s&force=%s&client=emacs" khoj-server-url type-query force-update)) (url-request-method "GET")) (progn (setq khoj--content-type content-type) - (url-retrieve update-url (lambda (_) (message "khoj.el: %s index %supdated!" content-type (if (member "--force-update" args) "force " ""))))))) + (khoj--server-index-files force-update content-type)))) (transient-define-suffix khoj--chat-command (&optional _) "Command to Chat with Khoj." diff --git a/src/interface/obsidian/src/utils.ts b/src/interface/obsidian/src/utils.ts index 8f004469..7e32eccd 100644 --- a/src/interface/obsidian/src/utils.ts +++ b/src/interface/obsidian/src/utils.ts @@ -68,7 +68,7 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las } // Call Khoj backend to update index with all markdown, pdf files - const response = await fetch(`${setting.khojUrl}/api/v1/index/update?regenerate=${regenerate}`, { + const response = await fetch(`${setting.khojUrl}/api/v1/index/update?force=${regenerate}&client=obsidian`, { method: 'POST', headers: { 'x-api-key': 'secret', diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py index 644712a5..321b3788 100644 --- a/src/khoj/routers/indexer.py +++ b/src/khoj/routers/indexer.py @@ -61,8 +61,8 @@ async def update( request: Request, files: list[UploadFile], x_api_key: str = Header(None), - regenerate: bool = False, - search_type: Optional[Union[state.SearchType, str]] = None, + force: bool = False, + t: Optional[Union[state.SearchType, str]] = None, client: Optional[str] = None, user_agent: Optional[str] = Header(None), referer: Optional[str] = Header(None), @@ -132,8 +132,8 @@ async def update( state.config.content_type, indexer_input.dict(), state.search_models, - regenerate=regenerate, - t=search_type, + regenerate=force, + t=t, full_corpus=False, ) diff --git a/tests/test_client.py b/tests/test_client.py index d17f20fd..f012081c 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -80,7 +80,7 @@ def test_regenerate_with_valid_content_type(client): headers = {"x-api-key": "secret"} # Act - response = client.post(f"/api/v1/index/update?search_type={content_type}", files=files, headers=headers) + response = client.post(f"/api/v1/index/update?t={content_type}", files=files, headers=headers) # Assert assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}" @@ -95,7 +95,7 @@ def test_regenerate_with_github_fails_without_pat(client): headers = {"x-api-key": "secret"} # Act - response = client.post(f"/api/v1/index/update?search_type=github", files=files, headers=headers) + response = client.post(f"/api/v1/index/update?t=github", files=files, headers=headers) # Assert assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github" From 6a4f1b218823dc39c9cef95e5db5b76eee866419 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 05:31:57 -0700 Subject: [PATCH 46/62] Add more client, request details in logs by index/update API endpoint --- src/khoj/routers/indexer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git 
a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py index 321b3788..a09e33f5 100644 --- a/src/khoj/routers/indexer.py +++ b/src/khoj/routers/indexer.py @@ -72,7 +72,7 @@ async def update( raise HTTPException(status_code=401, detail="Invalid API Key") state.config_lock.acquire() try: - logger.info("📬 Updating content index via API") + logger.info(f"📬 Updating content index via API call by {client}") org_files: Dict[str, str] = {} markdown_files: Dict[str, str] = {} pdf_files: Dict[str, str] = {} @@ -95,7 +95,7 @@ async def update( file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read() ) else: - logger.warning(f"Skipped indexing unsupported file type sent by client: {file.filename}") + logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}") indexer_input = IndexerInput( org=org_files, @@ -138,7 +138,9 @@ async def update( ) except Exception as e: - logger.error(f"🚨 Failed to update content index via API: {e}", exc_info=True) + logger.error( + f"🚨 Failed to {force} update {t} content index triggered via API call by {client}: {e}", exc_info=True + ) finally: state.config_lock.release() @@ -152,7 +154,7 @@ async def update( host=host, ) - logger.info("📪 Content index updated via API") + logger.info(f"📪 Content index updated via API call by {client}") return Response(content="OK", status_code=200) From 7b1c62ba53b20f5a8456e6cbb7a75d725dafc9e8 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 05:55:39 -0700 Subject: [PATCH 47/62] Mark test_get_configured_types_via_api unit test as flaky It passes locally on running individually but fails when run in parallel on local or CI --- tests/test_client.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_client.py b/tests/test_client.py index f012081c..55bf09f7 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -6,6 +6,7 @@ from urllib.parse import quote # External Packages from fastapi.testclient import TestClient +import pytest # Internal Packages from khoj.main import app @@ -101,6 +102,7 @@ def test_regenerate_with_github_fails_without_pat(client): # ---------------------------------------------------------------------------------------------------- +@pytest.mark.skip(reason="Flaky test on parallel test runs") def test_get_configured_types_via_api(client): # Act response = client.get(f"/api/config/types") From b8976426eb7799cf5a4a9f34354bf005aebec9c5 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 06:30:20 -0700 Subject: [PATCH 48/62] Update offline chat model config schema used by Emacs, Obsidian clients The server uses a new schema for the conversation config. The Emacs, Obsidian clients need to use this schema to update the conversation config --- src/interface/emacs/khoj.el | 20 +++++++++++++++----- src/interface/obsidian/src/utils.ts | 28 +++++++++++++++++++++++----- src/khoj/utils/constants.py | 14 ++++++++++++-- 3 files changed, 50 insertions(+), 12 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 70980241..55d4bbb4 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -261,6 +261,11 @@ for example), set this to the full interpreter path." :type 'boolean :group 'khoj) +(defcustom khoj-offline-chat-model nil + "Specify chat model to use for offline chat with khoj." + :type 'string + :group 'khoj) + (defcustom khoj-auto-setup t "Automate install, configure and start of khoj server. 
Auto invokes setup steps on calling main entrypoint." @@ -405,7 +410,8 @@ CONFIG is json obtained from Khoj config API." (default-index-dir (khoj--get-directory-from-config default-config '(content-type org embeddings-file))) (default-chat-dir (khoj--get-directory-from-config default-config '(processor conversation conversation-logfile))) (chat-model (or khoj-chat-model (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor default-config)))))) - (enable-offline-chat (or khoj-chat-offline (alist-get 'enable-offline-chat (alist-get 'conversation (alist-get 'processor default-config))))) + (enable-offline-chat (or khoj-chat-offline (alist-get 'enable-offline-chat (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor default-config)))))) + (offline-chat-model (or khoj-offline-chat-model (alist-get 'chat-model (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor default-config)))))) (config (or current-config default-config))) ;; Configure content types @@ -469,7 +475,8 @@ CONFIG is json obtained from Khoj config API." (message "khoj.el: Chat not configured yet.") (setq config (delq (assoc 'processor config) config)) (cl-pushnew `(processor . ((conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir)) - (enable-offline-chat . ,enable-offline-chat) + (offline-chat . ((enable-offline-chat . ,enable-offline-chat) + (chat-model . ,offline-chat-model))) (openai . ((chat-model . ,chat-model) (api-key . ,khoj-openai-api-key))))))) config)) @@ -480,7 +487,8 @@ CONFIG is json obtained from Khoj config API." (let ((new-processor-type (alist-get 'processor config))) (setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type)) (cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir)) - (enable-offline-chat . ,enable-offline-chat) + (offline-chat . ((enable-offline-chat . ,enable-offline-chat) + (chat-model . ,offline-chat-model))) (openai . ((chat-model . ,chat-model) (api-key . ,khoj-openai-api-key))))) new-processor-type) @@ -490,13 +498,15 @@ CONFIG is json obtained from Khoj config API." ;; Else if chat configuration in khoj backend has gone stale ((not (and (equal (alist-get 'api-key (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-openai-api-key) (equal (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-chat-model) - (equal (alist-get 'enable-offline-chat (alist-get 'conversation (alist-get 'processor config))) enable-offline-chat))) + (equal (alist-get 'enable-offline-chat (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor config)))) enable-offline-chat) + (equal (alist-get 'chat-model (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor config)))) offline-chat-model))) (message "khoj.el: Chat configuration has gone stale.") (let* ((chat-directory (khoj--get-directory-from-config config '(processor conversation conversation-logfile))) (new-processor-type (alist-get 'processor config))) (setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type)) (cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" chat-directory)) - (enable-offline-chat . ,enable-offline-chat) + (offline-chat . ((enable-offline-chat . ,enable-offline-chat) + (chat-model . ,offline-chat-model))) (openai . ((chat-model . ,khoj-chat-model) (api-key . 
,khoj-openai-api-key))))) new-processor-type) diff --git a/src/interface/obsidian/src/utils.ts b/src/interface/obsidian/src/utils.ts index 7e32eccd..ace130e3 100644 --- a/src/interface/obsidian/src/utils.ts +++ b/src/interface/obsidian/src/utils.ts @@ -14,11 +14,18 @@ type OpenAIType = null | { "api-key": string; }; +type OfflineChatType = null | { + "chat-model": string; + "enable-offline-chat": boolean; +}; + interface ProcessorData { conversation: { "conversation-logfile": string; openai: OpenAIType; - "enable-offline-chat": boolean; + "offline-chat": OfflineChatType; + "tokenizer": null | string; + "max-prompt-size": null | number; }; } @@ -106,7 +113,8 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n // Get default config fields from khoj backend let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response)); let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]); - let khojDefaultChatModelName = defaultConfig["processor"]["conversation"]["openai"]["chat-model"]; + let khojDefaultOpenAIChatModelName = defaultConfig["processor"]["conversation"]["openai"]["chat-model"]; + let khojDefaultOfflineChatModelName = defaultConfig["processor"]["conversation"]["offline-chat"]["chat-model"]; // Get current config if khoj backend configured, else get default config from khoj backend await request(khoj_already_configured ? khojConfigUrl : `${khojConfigUrl}/default`) @@ -117,13 +125,18 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n "conversation": { "conversation-logfile": conversationLogFile, "openai": null, - "enable-offline-chat": setting.enableOfflineChat, + "offline-chat": { + "chat-model": khojDefaultOfflineChatModelName, + "enable-offline-chat": setting.enableOfflineChat, + }, + "tokenizer": null, + "max-prompt-size": null, } } // If the Open AI API Key was configured in the plugin settings if (!!setting.openaiApiKey) { - let openAIChatModel = data?.["processor"]?.["conversation"]?.["openai"]?.["chat-model"] ?? khojDefaultChatModelName; + let openAIChatModel = data?.["processor"]?.["conversation"]?.["openai"]?.["chat-model"] ?? 
khojDefaultOpenAIChatModelName; processorData = { "conversation": { "conversation-logfile": conversationLogFile, @@ -131,7 +144,12 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n "chat-model": openAIChatModel, "api-key": setting.openaiApiKey, }, - "enable-offline-chat": setting.enableOfflineChat, + "offline-chat": { + "chat-model": khojDefaultOfflineChatModelName, + "enable-offline-chat": setting.enableOfflineChat, + }, + "tokenizer": null, + "max-prompt-size": null, }, } } diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py index 8da50d76..9ed97798 100644 --- a/src/khoj/utils/constants.py +++ b/src/khoj/utils/constants.py @@ -53,7 +53,12 @@ empty_config = { "api-key": None, "chat-model": "gpt-3.5-turbo", }, - "enable-offline-chat": False, + "offline-chat": { + "enable-offline-chat": False, + "chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin", + }, + "tokenizer": None, + "max-prompt-size": None, "conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json", } }, @@ -125,7 +130,12 @@ default_config = { "api-key": None, "chat-model": "gpt-3.5-turbo", }, - "enable-offline-chat": False, + "offline-chat": { + "enable-offline-chat": False, + "chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin", + }, + "tokenizer": None, + "max-prompt-size": None, "conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json", } }, From 2646c8554dfe0a0a9fe6499a3dc6ce6c85a40764 Mon Sep 17 00:00:00 2001 From: sabaimran Date: Tue, 17 Oct 2023 10:35:13 -0700 Subject: [PATCH 49/62] Provide a default value to offline_chat configuration of the conversation processor --- src/khoj/routers/api.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 7e0ab522..1512afd0 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -30,6 +30,7 @@ from khoj.utils.rawconfig import ( GithubContentConfig, NotionContentConfig, ConversationProcessorConfig, + OfflineChatProcessorConfig, ) from khoj.utils.helpers import resolve_absolute_path from khoj.utils.state import SearchType @@ -302,6 +303,9 @@ if not state.demo: state.config.processor = ProcessorConfig(conversation=ConversationProcessorConfig(conversation_logfile=conversation_logfile)) # type: ignore assert state.config.processor.conversation is not None + if state.config.processor.conversation.offline_chat is None: + state.config.processor.conversation.offline_chat = OfflineChatProcessorConfig() + state.config.processor.conversation.offline_chat.enable_offline_chat = enable_offline_chat if offline_chat_model is not None: state.config.processor.conversation.offline_chat.chat_model = offline_chat_model From 3d7381446d695353b5b315065afa602e2202a0ed Mon Sep 17 00:00:00 2001 From: Andrew Spott Date: Tue, 17 Oct 2023 12:26:06 -0600 Subject: [PATCH 50/62] =?UTF-8?q?Changed=20globbing.=20=20Now=20doesn't=20?= =?UTF-8?q?clobber=20a=20users=20glob=20if=20they=20want=20to=20a=E2=80=A6?= =?UTF-8?q?=20(#496)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Changed globbing. Now doesn't clobber a users glob if they want to add it, but will (if just given a directory), add a recursive glob. Note: python's glob engine doesn't support `{}` globing, a future option is to warn if that is included. 
* Fix typo in globformat variable * Use older glob pattern for plaintext files --------- Co-authored-by: Saba --- .../interface/web/content_type_input.html | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/khoj/interface/web/content_type_input.html b/src/khoj/interface/web/content_type_input.html index 3ef512f8..1f0dfa76 100644 --- a/src/khoj/interface/web/content_type_input.html +++ b/src/khoj/interface/web/content_type_input.html @@ -34,7 +34,7 @@ {% else %} {% for input_filter in current_config['input_filter'] %} - + {% endfor %} {% endif %} @@ -106,17 +106,18 @@ submit.addEventListener("click", function(event) { event.preventDefault(); - let globFormat = "**/*." + let globFormat = "**/*" let suffixes = []; if ('{{content_type}}' == "markdown") - suffixes = ["md", "markdown"] + suffixes = [".md", ".markdown"] else if ('{{content_type}}' == "org") - suffixes = ["org"] + suffixes = [".org"] else if ('{{content_type}}' === "pdf") - suffixes = ["pdf"] + suffixes = [".pdf"] else if ('{{content_type}}' === "plaintext") - suffixes = ['*'] + suffixes = ['.*'] + let globs = suffixes.map(x => `${globFormat}${x}`) var inputFileNodes = document.getElementsByName("input-files"); var inputFiles = getValidInputNodes(inputFileNodes).map(node => node.value); @@ -124,10 +125,19 @@ var inputFilter = []; var nodes = getValidInputNodes(inputFilterNodes); + + // A regex that checks for globs in the path. If they exist, + // we are going to just not add our own globing. If they don't, + // then we will assume globbing should be done. + const glob_regex = /([*?\[\]])/; if (nodes.length > 0) { for (var i = 0; i < nodes.length; i++) { - for (var j = 0; j < suffixes.length; j++) { - inputFilter.push(nodes[i].value + globFormat + suffixes[j]); + for (var j = 0; j < globs.length; j++) { + if (glob_regex.test(nodes[i].value)) { + inputFilter.push(nodes[i].value); + } else { + inputFilter.push(nodes[i].value + globs[j]); + } } } } From ba60c869c954361a59c2e728c97d8dc4aa0babdd Mon Sep 17 00:00:00 2001 From: sabaimran Date: Tue, 17 Oct 2023 13:05:50 -0700 Subject: [PATCH 51/62] Fix encoding binary files like PDFs for sync from Desktop client Use readFileSync, Buffer to pass appropriately formatted binary data --- src/interface/desktop/main.js | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/interface/desktop/main.js b/src/interface/desktop/main.js index 9b2ee49c..e77a3363 100644 --- a/src/interface/desktop/main.js +++ b/src/interface/desktop/main.js @@ -135,9 +135,10 @@ function pushDataToKhoj (regenerate = false) { } try { - encoding = binaryFileTypes.includes(file.split('.').pop()) ? "binary" : "utf8"; - mimeType = filenameToMimeType(file) + (encoding === "utf8" ? "; charset=UTF-8" : ""); - fileObj = new Blob([fs.createReadStream(file, encoding)], { type: mimeType }); + let encoding = binaryFileTypes.includes(file.split('.').pop()) ? "binary" : "utf8"; + let mimeType = filenameToMimeType(file) + (encoding === "utf8" ? 
"; charset=UTF-8" : ""); + let fileContent = Buffer.from(fs.readFileSync(file, { encoding: encoding }), encoding); + let fileObj = new Blob([fileContent], { type: mimeType }); formData.append('files', fileObj, file); state[file] = { success: true, From c8293998d95c36e25450b127ae84d11d1c454698 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 15:07:05 -0700 Subject: [PATCH 52/62] Fix encoding binary files like PDFs for sync from Obsidian client Use readBinary to read binary files like PDFs instead of read --- src/interface/obsidian/src/utils.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/interface/obsidian/src/utils.ts b/src/interface/obsidian/src/utils.ts index ace130e3..02d7e272 100644 --- a/src/interface/obsidian/src/utils.ts +++ b/src/interface/obsidian/src/utils.ts @@ -62,7 +62,7 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las countOfFilesToIndex++; const encoding = binaryFileTypes.includes(file.extension) ? "binary" : "utf8"; const mimeType = fileExtensionToMimeType(file.extension) + (encoding === "utf8" ? "; charset=UTF-8" : ""); - const fileContent = await vault.read(file); + const fileContent = encoding == 'binary' ? await vault.readBinary(file) : await vault.read(file); formData.append('files', new Blob([fileContent], { type: mimeType }), file.path); } From d9d133dfb9d08b32b0ae482fde5462bc39c3f853 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 21:31:15 -0700 Subject: [PATCH 53/62] Read text files as utf-8, instead of default os locale On Windows, the default locale isn't utf8. Khoj had regressed to reading files in OS specified locale encoding, e.g cp1252, cp949 etc. It now explicitly uses utf8 encoding to read text files for indexing Resolves #495, resolves #472 --- src/khoj/routers/indexer.py | 7 ++++--- src/khoj/utils/fs_syncer.py | 9 ++++----- tests/test_text_search.py | 12 ++++++------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py index a09e33f5..a9656050 100644 --- a/src/khoj/routers/indexer.py +++ b/src/khoj/routers/indexer.py @@ -72,7 +72,7 @@ async def update( raise HTTPException(status_code=401, detail="Invalid API Key") state.config_lock.acquire() try: - logger.info(f"📬 Updating content index via API call by {client}") + logger.info(f"📬 Updating content index via API call by {client} client") org_files: Dict[str, str] = {} markdown_files: Dict[str, str] = {} pdf_files: Dict[str, str] = {} @@ -139,7 +139,8 @@ async def update( except Exception as e: logger.error( - f"🚨 Failed to {force} update {t} content index triggered via API call by {client}: {e}", exc_info=True + f"🚨 Failed to {force} update {t} content index triggered via API call by {client} client: {e}", + exc_info=True, ) finally: state.config_lock.release() @@ -154,7 +155,7 @@ async def update( host=host, ) - logger.info(f"📪 Content index updated via API call by {client}") + logger.info(f"📪 Content index updated via API call by {client} client") return Response(content="OK", status_code=200) diff --git a/src/khoj/utils/fs_syncer.py b/src/khoj/utils/fs_syncer.py index 5cf97add..1745b760 100644 --- a/src/khoj/utils/fs_syncer.py +++ b/src/khoj/utils/fs_syncer.py @@ -1,6 +1,5 @@ import logging import glob -import base64 from typing import Optional from bs4 import BeautifulSoup @@ -69,7 +68,7 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]: filename_to_content_map = {} for file in all_target_files: - 
with open(file, "r") as f: + with open(file, "r", encoding="utf8") as f: try: plaintext_content = f.read() if file.endswith(("html", "htm", "xml")): @@ -115,7 +114,7 @@ def get_org_files(config: TextContentConfig): filename_to_content_map = {} for file in all_org_files: - with open(file, "r") as f: + with open(file, "r", encoding="utf8") as f: try: filename_to_content_map[file] = f.read() except Exception as e: @@ -137,7 +136,7 @@ def get_markdown_files(config: TextContentConfig): logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified") return {} - "Get Markdown files to process" + # Get markdown files to process absolute_markdown_files, filtered_markdown_files = set(), set() if markdown_files: absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files} @@ -164,7 +163,7 @@ def get_markdown_files(config: TextContentConfig): filename_to_content_map = {} for file in all_markdown_files: - with open(file, "r") as f: + with open(file, "r", encoding="utf8") as f: try: filename_to_content_map[file] = f.read() except Exception as e: diff --git a/tests/test_text_search.py b/tests/test_text_search.py index b1a9aa4d..60246a61 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -1,26 +1,25 @@ # System Packages import logging +import locale from pathlib import Path import os # External Packages import pytest -from khoj.utils.config import SearchModels # Internal Packages from khoj.utils.state import content_index, search_models from khoj.search_type import text_search -from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl from khoj.processor.github.github_to_jsonl import GithubToJsonl +from khoj.utils.config import SearchModels from khoj.utils.fs_syncer import get_org_files +from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig # Test # ---------------------------------------------------------------------------------------------------- -def test_text_search_setup_with_missing_file_raises_error( - org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig -): +def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_new_file: TextContentConfig): # Arrange # Ensure file mentioned in org.input-files is missing single_new_file = Path(org_config_with_only_new_file.input_files[0]) @@ -29,7 +28,7 @@ def test_text_search_setup_with_missing_file_raises_error( # Act # Generate notes embeddings during asymmetric setup with pytest.raises(FileNotFoundError): - data = get_org_files(org_config_with_only_new_file) + get_org_files(org_config_with_only_new_file) # ---------------------------------------------------------------------------------------------------- @@ -48,6 +47,7 @@ def test_text_search_setup_with_empty_file_raises_error( def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels): # Arrange data = get_org_files(content_config.org) + # Act # Regenerate notes embeddings during asymmetric setup notes_model = text_search.setup( From 51363d280d5eed92eb6bad9b5d5ca03a0b2db953 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 21:44:54 -0700 Subject: [PATCH 54/62] Do not configure khoj server for pull based indexing from khoj.el Do not make khoj server pull update index on Obsidian plugin load. 
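For context, here is a minimal sketch of the push-based sync that replaces this pull flow, written in Python with the requests library. The server URL, file paths and client name below are illustrative assumptions; the /api/v1/index/update route, the force and client query params, the multipart 'files' field and the x-api-key header mirror what the Obsidian, Emacs and Desktop clients in the patches above send.

# Minimal sketch (not part of the patch): push files to the Khoj indexer API
# the way the clients above now do. Assumed values: the server URL, the file
# paths passed in, and the client name "example".
import requests

MIME_BY_EXTENSION = {"md": "text/markdown", "org": "text/org", "pdf": "application/pdf"}

def push_files_for_indexing(file_paths, khoj_url="http://localhost:42110", force=False):
    parts = []
    for path in file_paths:
        extension = path.rsplit(".", 1)[-1].lower()
        mime_type = MIME_BY_EXTENSION.get(extension, "text/plain")
        if mime_type == "application/pdf":
            content = open(path, "rb").read()  # binary files passed as raw bytes
        else:
            content = open(path, "r", encoding="utf8").read()  # text files read as utf-8
            mime_type += "; charset=UTF-8"
        # Each file becomes one "files" part of the multipart/form-data body
        parts.append(("files", (path, content, mime_type)))
    response = requests.post(
        f"{khoj_url}/api/v1/index/update?force={str(force).lower()}&client=example",
        headers={"x-api-key": "secret"},  # static key the indexer endpoint expects
        files=parts,
    )
    response.raise_for_status()
    return response
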
Index is updated on push from plugin instead now/ --- src/interface/emacs/khoj.el | 49 ++--------------------------- src/interface/obsidian/src/utils.ts | 6 +--- 2 files changed, 3 insertions(+), 52 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 55d4bbb4..bb81e726 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -397,8 +397,7 @@ CONFIG is json obtained from Khoj config API." (defun khoj--server-configure () "Configure the Khoj server for search and chat." (interactive) - (let* ((org-directory-regexes (or (mapcar (lambda (dir) (format "%s/**/*.org" dir)) khoj-org-directories) json-null)) - (url-request-method "GET") + (let* ((url-request-method "GET") (current-config (with-temp-buffer (url-insert-file-contents (format "%s/api/config/data" khoj-server-url)) @@ -407,56 +406,12 @@ CONFIG is json obtained from Khoj config API." (with-temp-buffer (url-insert-file-contents (format "%s/api/config/data/default" khoj-server-url)) (ignore-error json-end-of-file (json-parse-buffer :object-type 'alist :array-type 'list :null-object json-null :false-object json-false)))) - (default-index-dir (khoj--get-directory-from-config default-config '(content-type org embeddings-file))) (default-chat-dir (khoj--get-directory-from-config default-config '(processor conversation conversation-logfile))) (chat-model (or khoj-chat-model (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor default-config)))))) (enable-offline-chat (or khoj-chat-offline (alist-get 'enable-offline-chat (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor default-config)))))) (offline-chat-model (or khoj-offline-chat-model (alist-get 'chat-model (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor default-config)))))) (config (or current-config default-config))) - ;; Configure content types - (cond - ;; If khoj backend is not configured yet - ((not current-config) - (message "khoj.el: Server not configured yet.") - (setq config (delq (assoc 'content-type config) config)) - (cl-pushnew `(content-type . ((org . ((input-files . ,khoj-org-files) - (input-filter . ,org-directory-regexes) - (compressed-jsonl . ,(format "%s/org.jsonl.gz" default-index-dir)) - (embeddings-file . ,(format "%s/org.pt" default-index-dir)) - (index-heading-entries . ,json-false))))) - config)) - - ;; Else if khoj config has no org content config - ((not (alist-get 'org (alist-get 'content-type config))) - (message "khoj.el: Org-mode content on server not configured yet.") - (let ((new-content-type (alist-get 'content-type config))) - (setq new-content-type (delq (assoc 'org new-content-type) new-content-type)) - (cl-pushnew `(org . ((input-files . ,khoj-org-files) - (input-filter . ,org-directory-regexes) - (compressed-jsonl . ,(format "%s/org.jsonl.gz" default-index-dir)) - (embeddings-file . ,(format "%s/org.pt" default-index-dir)) - (index-heading-entries . ,json-false))) - new-content-type) - (setq config (delq (assoc 'content-type config) config)) - (cl-pushnew `(content-type . 
,new-content-type) config))) - - ;; Else if khoj is not configured to index specified org files - ((not (and (equal (alist-get 'input-files (alist-get 'org (alist-get 'content-type config))) khoj-org-files) - (equal (alist-get 'input-filter (alist-get 'org (alist-get 'content-type config))) org-directory-regexes))) - (message "khoj.el: Org-mode content on server is stale.") - (let* ((index-directory (khoj--get-directory-from-config config '(content-type org embeddings-file))) - (new-content-type (alist-get 'content-type config))) - (setq new-content-type (delq (assoc 'org new-content-type) new-content-type)) - (cl-pushnew `(org . ((input-files . ,khoj-org-files) - (input-filter . ,org-directory-regexes) - (compressed-jsonl . ,(format "%s/org.jsonl.gz" index-directory)) - (embeddings-file . ,(format "%s/org.pt" index-directory)) - (index-heading-entries . ,json-false))) - new-content-type) - (setq config (delq (assoc 'content-type config) config)) - (cl-pushnew `(content-type . ,new-content-type) config)))) - ;; Configure processors (cond ((not khoj-openai-api-key) @@ -472,7 +427,7 @@ CONFIG is json obtained from Khoj config API." ;; If khoj backend isn't configured yet ((not current-config) - (message "khoj.el: Chat not configured yet.") + (message "khoj.el: Khoj not configured yet.") (setq config (delq (assoc 'processor config) config)) (cl-pushnew `(processor . ((conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir)) (offline-chat . ((enable-offline-chat . ,enable-offline-chat) diff --git a/src/interface/obsidian/src/utils.ts b/src/interface/obsidian/src/utils.ts index 02d7e272..eb3d4d12 100644 --- a/src/interface/obsidian/src/utils.ts +++ b/src/interface/obsidian/src/utils.ts @@ -178,12 +178,8 @@ export async function updateKhojBackend(khojUrl: string, khojConfig: Object) { method: 'POST', contentType: 'application/json', }; - // Save khojConfig on khoj backend at khojConfigUrl - await request(requestContent) - // Refresh khoj search index after updating config - .then(_ => request(`${khojUrl}/api/update?t=markdown`)) - .then(_ => request(`${khojUrl}/api/update?t=pdf`)); + request(requestContent); } function getIndexDirectoryFromBackendConfig(filepath: string) { From e3cd8b415061c5167861c7ca8435b4eb521a712a Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 22:59:10 -0700 Subject: [PATCH 55/62] Only index files returned by input-filter globs in fs_syncer Ignore .org, .pdf etc. suffixed directories under `input-filter' from being evaluated as files. 
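In essence the fix adds an `os.path.isfile` guard after glob expansion, so a directory whose name merely ends in a note suffix (for example `directory.org/`) is no longer opened as a file. A condensed sketch of the pattern; `files_matching` is an illustrative helper, while the patch below applies the same guard inside each `get_*_files` function:

```python
# Sketch: expand a recursive glob but keep only real files, so directories
# with file-like suffixes (e.g. "directory.org/") are skipped by the indexer.
import glob
import os


def files_matching(pattern: str) -> set[str]:
    """Return the paths matched by a glob pattern that are actual files."""
    return {
        matched_path
        for matched_path in glob.glob(os.path.expanduser(pattern), recursive=True)
        if os.path.isfile(matched_path)
    }


# files_matching("~/notes/**/*") skips ~/notes/directory.org/ itself
# but still returns ~/notes/directory.org/file.org
```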
Explicitly filter results by input-filter globs to only index files, not directory for each text type Add test to prevent regression Closes #448 --- src/khoj/utils/fs_syncer.py | 5 +++++ tests/test_text_search.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/src/khoj/utils/fs_syncer.py b/src/khoj/utils/fs_syncer.py index 1745b760..12c4e5dc 100644 --- a/src/khoj/utils/fs_syncer.py +++ b/src/khoj/utils/fs_syncer.py @@ -1,5 +1,6 @@ import logging import glob +import os from typing import Optional from bs4 import BeautifulSoup @@ -53,6 +54,7 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]: filtered_file for jsonl_file_filter in input_filter for filtered_file in glob.glob(get_absolute_path(jsonl_file_filter), recursive=True) + if os.path.isfile(filtered_file) } all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files) @@ -102,6 +104,7 @@ def get_org_files(config: TextContentConfig): filtered_file for org_file_filter in org_file_filter for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True) + if os.path.isfile(filtered_file) } all_org_files = sorted(absolute_org_files | filtered_org_files) @@ -146,6 +149,7 @@ def get_markdown_files(config: TextContentConfig): filtered_file for markdown_file_filter in markdown_file_filter for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True) + if os.path.isfile(filtered_file) } all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files) @@ -194,6 +198,7 @@ def get_pdf_files(config: TextContentConfig): filtered_file for pdf_file_filter in pdf_file_filter for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True) + if os.path.isfile(filtered_file) } all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files) diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 60246a61..179718fa 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -31,6 +31,22 @@ def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_n get_org_files(org_config_with_only_new_file) +# ---------------------------------------------------------------------------------------------------- +def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path: Path): + # Arrange + orgfile = tmp_path / "directory.org" / "file.org" + orgfile.parent.mkdir() + with open(orgfile, "w") as f: + f.write("* Heading\n- List item\n") + org_content_config = TextContentConfig( + input_filter=[f"{tmp_path}/**/*"], compressed_jsonl="test.jsonl", embeddings_file="test.pt" + ) + + # Act + # should not raise IsADirectoryError and return orgfile + assert get_org_files(org_content_config) == {f"{orgfile}": "* Heading\n- List item\n"} + + # ---------------------------------------------------------------------------------------------------- def test_text_search_setup_with_empty_file_raises_error( org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig From cf1cdc3fe18446b8fea1fe47e4dd9327aea9ce1c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 23:30:45 -0700 Subject: [PATCH 56/62] Disambiguate input_filter variable names in fs_syncer functions --- src/khoj/utils/fs_syncer.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/khoj/utils/fs_syncer.py b/src/khoj/utils/fs_syncer.py index 12c4e5dc..74619581 100644 --- a/src/khoj/utils/fs_syncer.py +++ b/src/khoj/utils/fs_syncer.py @@ 
-35,13 +35,13 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]: return soup.get_text(strip=True, separator="\n") # Extract required fields from config - input_files, input_filter = ( + input_files, input_filters = ( config.input_files, config.input_filter, ) # Input Validation - if is_none_or_empty(input_files) and is_none_or_empty(input_filter): + if is_none_or_empty(input_files) and is_none_or_empty(input_filters): logger.debug("At least one of input-files or input-file-filter is required to be specified") return {} @@ -49,11 +49,11 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]: absolute_plaintext_files, filtered_plaintext_files = set(), set() if input_files: absolute_plaintext_files = {get_absolute_path(jsonl_file) for jsonl_file in input_files} - if input_filter: + if input_filters: filtered_plaintext_files = { filtered_file - for jsonl_file_filter in input_filter - for filtered_file in glob.glob(get_absolute_path(jsonl_file_filter), recursive=True) + for plaintext_file_filter in input_filters + for filtered_file in glob.glob(get_absolute_path(plaintext_file_filter), recursive=True) if os.path.isfile(filtered_file) } @@ -85,13 +85,13 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]: def get_org_files(config: TextContentConfig): # Extract required fields from config - org_files, org_file_filter = ( + org_files, org_file_filters = ( config.input_files, config.input_filter, ) # Input Validation - if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter): + if is_none_or_empty(org_files) and is_none_or_empty(org_file_filters): logger.debug("At least one of org-files or org-file-filter is required to be specified") return {} @@ -99,10 +99,10 @@ def get_org_files(config: TextContentConfig): absolute_org_files, filtered_org_files = set(), set() if org_files: absolute_org_files = {get_absolute_path(org_file) for org_file in org_files} - if org_file_filter: + if org_file_filters: filtered_org_files = { filtered_file - for org_file_filter in org_file_filter + for org_file_filter in org_file_filters for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True) if os.path.isfile(filtered_file) } @@ -129,13 +129,13 @@ def get_org_files(config: TextContentConfig): def get_markdown_files(config: TextContentConfig): # Extract required fields from config - markdown_files, markdown_file_filter = ( + markdown_files, markdown_file_filters = ( config.input_files, config.input_filter, ) # Input Validation - if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter): + if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filters): logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified") return {} @@ -144,10 +144,10 @@ def get_markdown_files(config: TextContentConfig): if markdown_files: absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files} - if markdown_file_filter: + if markdown_file_filters: filtered_markdown_files = { filtered_file - for markdown_file_filter in markdown_file_filter + for markdown_file_filter in markdown_file_filters for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True) if os.path.isfile(filtered_file) } @@ -179,13 +179,13 @@ def get_markdown_files(config: TextContentConfig): def get_pdf_files(config: TextContentConfig): # Extract required fields from config - pdf_files, pdf_file_filter = ( + pdf_files, pdf_file_filters = ( 
config.input_files, config.input_filter, ) # Input Validation - if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filter): + if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filters): logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified") return {} @@ -193,10 +193,10 @@ def get_pdf_files(config: TextContentConfig): absolute_pdf_files, filtered_pdf_files = set(), set() if pdf_files: absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files} - if pdf_file_filter: + if pdf_file_filters: filtered_pdf_files = { filtered_file - for pdf_file_filter in pdf_file_filter + for pdf_file_filter in pdf_file_filters for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True) if os.path.isfile(filtered_file) } From 71b0012e8c0f0860775a41d2d94d6c1e4180918e Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 18 Oct 2023 00:59:43 -0700 Subject: [PATCH 57/62] Set offline chat config to default value if unset on server load --- src/khoj/utils/config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index 3930ec98..cdc0d260 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -12,6 +12,8 @@ from khoj.processor.conversation.gpt4all.utils import download_model # External Packages import torch +from khoj.utils.rawconfig import OfflineChatProcessorConfig + logger = logging.getLogger(__name__) # Internal Packages @@ -94,7 +96,7 @@ class ConversationProcessorConfigModel: ): self.openai_model = conversation_config.openai self.gpt4all_model = GPT4AllProcessorConfig() - self.offline_chat = conversation_config.offline_chat + self.offline_chat = conversation_config.offline_chat or OfflineChatProcessorConfig() self.max_prompt_size = conversation_config.max_prompt_size self.tokenizer = conversation_config.tokenizer self.conversation_logfile = Path(conversation_config.conversation_logfile) From 53abd1a5063dca5529fbc3f64da6ded678c89030 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 18 Oct 2023 01:00:41 -0700 Subject: [PATCH 58/62] Mark sync completed on desktop client, even when no files to send Previously Sync spinner on desktop config screen would hang when no files to send to server & the Sync button had been manually triggered --- src/interface/desktop/main.js | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/interface/desktop/main.js b/src/interface/desktop/main.js index e77a3363..fd75e3a7 100644 --- a/src/interface/desktop/main.js +++ b/src/interface/desktop/main.js @@ -111,10 +111,12 @@ function pushDataToKhoj (regenerate = false) { const folders = store.get('folders') || []; state = { completed: true } + // Collect paths of all configured files to index for (const file of files) { filesToPush.push(file.path); } + // Collect paths of all indexable files in configured folders for (const folder of folders) { const files = fs.readdirSync(folder.path, { withFileTypes: true }); for (const file of files) { @@ -129,11 +131,13 @@ function pushDataToKhoj (regenerate = false) { for (const file of filesToPush) { const stats = fs.statSync(file); if (!regenerate) { + // Only push files that have been modified since last sync if (stats.mtime.toISOString() < lastSync.find((syncedFile) => syncedFile.path === file)?.datetime) { continue; } } + // Collect all updated or newly created files since last sync to index on Khoj server try { let encoding = 
binaryFileTypes.includes(file.split('.').pop()) ? "binary" : "utf8"; let mimeType = filenameToMimeType(file) + (encoding === "utf8" ? "; charset=UTF-8" : ""); @@ -152,6 +156,7 @@ function pushDataToKhoj (regenerate = false) { } } + // Mark deleted files for removal from index on Khoj server for (const syncedFile of lastSync) { if (!filesToPush.includes(syncedFile.path)) { fileObj = new Blob([""], { type: filenameToMimeType(syncedFile.path) }); @@ -159,6 +164,7 @@ function pushDataToKhoj (regenerate = false) { } } + // Send collected files to Khoj server for indexing if (!!formData?.entries()?.next().value) { const hostURL = store.get('hostURL') || KHOJ_URL; const headers = { @@ -167,8 +173,6 @@ function pushDataToKhoj (regenerate = false) { axios.post(`${hostURL}/api/v1/index/update?force=${regenerate}&client=desktop`, formData, { headers }) .then(response => { console.log(response.data); - const win = BrowserWindow.getAllWindows()[0]; - win.webContents.send('update-state', state); let lastSync = []; for (const file of filesToPush) { lastSync.push({ @@ -181,9 +185,16 @@ function pushDataToKhoj (regenerate = false) { .catch(error => { console.error(error); state['completed'] = false + }) + .finally(() => { + // Syncing complete const win = BrowserWindow.getAllWindows()[0]; - win.webContents.send('update-state', state); + if (win) win.webContents.send('update-state', state); }); + } else { + // Syncing complete + const win = BrowserWindow.getAllWindows()[0]; + if (win) win.webContents.send('update-state', state); } } From 6631fc38dbcb3ebbab1576bfff798b9a910b0ca2 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 18 Oct 2023 03:23:17 -0700 Subject: [PATCH 59/62] Delete plaintext config via API. Catch any offline model loading exception --- src/interface/obsidian/src/settings.ts | 2 +- src/khoj/routers/api.py | 4 ++++ src/khoj/utils/config.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/interface/obsidian/src/settings.ts b/src/interface/obsidian/src/settings.ts index 9b672659..c3f40905 100644 --- a/src/interface/obsidian/src/settings.ts +++ b/src/interface/obsidian/src/settings.ts @@ -1,4 +1,4 @@ -import { App, Notice, PluginSettingTab, request, Setting, TFile } from 'obsidian'; +import { App, Notice, PluginSettingTab, Setting, TFile } from 'obsidian'; import Khoj from 'src/main'; import { updateContentIndex } from './utils'; diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 1512afd0..345429e8 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -186,6 +186,10 @@ if not state.demo: state.content_index.markdown = None elif content_type == "org": state.content_index.org = None + elif content_type == "plaintext": + state.content_index.plaintext = None + else: + logger.warning(f"Request to delete unknown content type: {content_type} via API") try: save_config_to_file_updated_state() diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index cdc0d260..5b3b9f6e 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -106,7 +106,7 @@ class ConversationProcessorConfigModel: if self.offline_chat.enable_offline_chat: try: self.gpt4all_model.loaded_model = download_model(self.offline_chat.chat_model) - except ValueError as e: + except Exception as e: self.offline_chat.enable_offline_chat = False self.gpt4all_model.loaded_model = None logger.error(f"Error while loading offline chat model: {e}", exc_info=True) From 8346e1193cf31ce8d66de7793b958bdd06c9d2b9 Mon Sep 17 00:00:00 2001 From: Debanjum Singh 
Solanky Date: Wed, 18 Oct 2023 03:43:16 -0700 Subject: [PATCH 60/62] Release Khoj version 0.13.0 --- manifest.json | 2 +- src/interface/desktop/package.json | 2 +- src/interface/emacs/khoj.el | 2 +- src/interface/obsidian/manifest.json | 2 +- src/interface/obsidian/package.json | 2 +- src/interface/obsidian/versions.json | 3 ++- versions.json | 3 ++- 7 files changed, 9 insertions(+), 7 deletions(-) diff --git a/manifest.json b/manifest.json index 0ecc4fbb..0d5c71b8 100644 --- a/manifest.json +++ b/manifest.json @@ -1,7 +1,7 @@ { "id": "khoj", "name": "Khoj", - "version": "0.12.3", + "version": "0.13.0", "minAppVersion": "0.15.0", "description": "An Open-Source AI Personal Assistant for your Digital Brain", "author": "Khoj Inc.", diff --git a/src/interface/desktop/package.json b/src/interface/desktop/package.json index 0b5f220c..d74e831a 100644 --- a/src/interface/desktop/package.json +++ b/src/interface/desktop/package.json @@ -1,6 +1,6 @@ { "name": "Khoj", - "version": "0.12.3", + "version": "0.13.0", "description": "An AI copilot for your Second Brain", "author": "Saba Imran, Debanjum Singh Solanky ", "license": "GPL-3.0-or-later", diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index bb81e726..b9343c41 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -6,7 +6,7 @@ ;; Saba Imran ;; Description: An AI copilot for your Second Brain ;; Keywords: search, chat, org-mode, outlines, markdown, pdf, image -;; Version: 0.12.3 +;; Version: 0.13.0 ;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1")) ;; URL: https://github.com/khoj-ai/khoj/tree/master/src/interface/emacs diff --git a/src/interface/obsidian/manifest.json b/src/interface/obsidian/manifest.json index 0ecc4fbb..0d5c71b8 100644 --- a/src/interface/obsidian/manifest.json +++ b/src/interface/obsidian/manifest.json @@ -1,7 +1,7 @@ { "id": "khoj", "name": "Khoj", - "version": "0.12.3", + "version": "0.13.0", "minAppVersion": "0.15.0", "description": "An Open-Source AI Personal Assistant for your Digital Brain", "author": "Khoj Inc.", diff --git a/src/interface/obsidian/package.json b/src/interface/obsidian/package.json index 07c47140..beb049fa 100644 --- a/src/interface/obsidian/package.json +++ b/src/interface/obsidian/package.json @@ -1,6 +1,6 @@ { "name": "Khoj", - "version": "0.12.3", + "version": "0.13.0", "description": "An AI copilot for your Second Brain", "author": "Debanjum Singh Solanky, Saba Imran ", "license": "GPL-3.0-or-later", diff --git a/src/interface/obsidian/versions.json b/src/interface/obsidian/versions.json index cf60cf10..9cc1eb5c 100644 --- a/src/interface/obsidian/versions.json +++ b/src/interface/obsidian/versions.json @@ -24,5 +24,6 @@ "0.12.0": "0.15.0", "0.12.1": "0.15.0", "0.12.2": "0.15.0", - "0.12.3": "0.15.0" + "0.12.3": "0.15.0", + "0.13.0": "0.15.0" } diff --git a/versions.json b/versions.json index cf60cf10..9cc1eb5c 100644 --- a/versions.json +++ b/versions.json @@ -24,5 +24,6 @@ "0.12.0": "0.15.0", "0.12.1": "0.15.0", "0.12.2": "0.15.0", - "0.12.3": "0.15.0" + "0.12.3": "0.15.0", + "0.13.0": "0.15.0" } From d93395ae48d668ff372ac8fc4dd4b46f950fa8bc Mon Sep 17 00:00:00 2001 From: Debanjum Date: Wed, 18 Oct 2023 12:05:54 -0700 Subject: [PATCH 61/62] Set >=6Gb RAM required for offline chat Llama v2 7B with 4bit quantization technically needs ~3.5Gb RAM (7B * 0.5byte), practically a system with 6Gb of RAM should suffice --- docs/chat.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/chat.md b/docs/chat.md index 
eeca3132..b900d052 100644 --- a/docs/chat.md +++ b/docs/chat.md @@ -10,7 +10,7 @@ Offline chat stays completely private and works without internet. But it is slower, lower quality and more compute intensive. > **System Requirements**: -> - You need at least **16 GB of RAM** and **4 GB of Disk** +> - Machine with at least **6 GB of RAM** and **4 GB of Disk** available > - A CPU supporting [AVX or AVX2 instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) is required > - A Mac M1+ or [Vulcan supported GPU](https://vulkan.gpuinfo.org/) should significantly speed up chat response times From e3f8a95784735f7cd178096efe45c8e4e4a57168 Mon Sep 17 00:00:00 2001 From: Simon Butler Date: Thu, 19 Oct 2023 21:28:08 +0200 Subject: [PATCH 62/62] Update emacs.md (#510) Minor correction for emacs-lisp in minimal install --- docs/emacs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/emacs.md b/docs/emacs.md index 36b9f9db..6492ecc4 100644 --- a/docs/emacs.md +++ b/docs/emacs.md @@ -46,7 +46,7 @@ Indexes your org-agenda files, by default. (use-package khoj :ensure t :pin melpa-stable - :bind ("C-c s" . 'khoj) + :bind ("C-c s" . 'khoj)) ``` - Note: Install `khoj.el` from MELPA (instead of MELPA Stable) if you installed the pre-release version of khoj
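As a closing note on the system requirements change in the patch above, a quick sanity check of the estimate from its commit message; attributing the remaining headroom to the context window, runtime buffers, and OS overhead is an assumption, not something stated in the patch:

```python
# Memory math from the commit message: a 7B parameter model quantized to
# 4 bits stores roughly half a byte per parameter.
parameters = 7_000_000_000
bytes_per_parameter = 0.5  # 4-bit quantized weights
weights_gb = parameters * bytes_per_parameter / 1e9
print(f"~{weights_gb:.1f} GB just for the model weights")  # ~3.5 GB

# Context window, runtime buffers, and the OS need additional headroom,
# which is why the doc now recommends at least 6 GB of RAM in practice.
```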