From 2dd15e9f632d045bd7715c25c8dbc8d9423c31e3 Mon Sep 17 00:00:00 2001
From: sabaimran <65192171+sabaimran@users.noreply.github.com>
Date: Mon, 18 Sep 2023 14:41:26 -0700
Subject: [PATCH] Resolve issues with GPT4All and fix prompt for yesterday
 extract questions date filter (#483)

- GPT4All integration had stopped working with the previously pinned 0.1.9 release. Update to use 1.0.12. At a later date, we should also use first-party support for Llama v2 via gpt4all
- Update the system prompt for the extract_questions flow to add start and end date to the yesterday date filter example.
- Update all setup data in conftest.py to use the new client-server indexing pattern
---
 pyproject.toml                                |  4 +-
 .../processor/conversation/gpt4all/utils.py   |  7 ++--
 src/khoj/processor/conversation/prompts.py    |  2 +-
 tests/conftest.py                             | 37 +++++--------------
 4 files changed, 16 insertions(+), 34 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 61d6bef3..a52fc9b6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,8 +59,8 @@ dependencies = [
     "bs4 >= 0.0.1",
     "anyio == 3.7.1",
     "pymupdf >= 1.23.3",
-    "gpt4all == 0.1.9; platform_system == 'Linux' and platform_machine == 'x86_64'",
-    "gpt4all == 0.1.9; platform_system == 'Windows' or platform_system == 'Darwin'",
+    "gpt4all == 1.0.12; platform_system == 'Linux' and platform_machine == 'x86_64'",
+    "gpt4all == 1.0.12; platform_system == 'Windows' or platform_system == 'Darwin'",
 ]
 dynamic = ["version"]
 
diff --git a/src/khoj/processor/conversation/gpt4all/utils.py b/src/khoj/processor/conversation/gpt4all/utils.py
index 95eeb496..4042fbe2 100644
--- a/src/khoj/processor/conversation/gpt4all/utils.py
+++ b/src/khoj/processor/conversation/gpt4all/utils.py
@@ -28,9 +28,10 @@ def download_model(model_name: str):
         raise e
 
     url = model_metadata.model_name_to_url.get(model_name)
+    model_path = os.path.expanduser(f"~/.cache/gpt4all/")
     if not url:
         logger.debug(f"Model {model_name} not found in model metadata. Skipping download.")
-        return GPT4All(model_name)
+        return GPT4All(model_name=model_name, model_path=model_path)
 
     filename = os.path.expanduser(f"~/.cache/gpt4all/{model_name}")
     if os.path.exists(filename):
@@ -39,8 +40,8 @@ def download_model(model_name: str):
             requests.get("https://www.google.com/", timeout=5)
         except:
             logger.debug("User is offline. Disabling allowed download flag")
-            return GPT4All(model_name, allow_download=False)
-        return GPT4All(model_name)
+            return GPT4All(model_name=model_name, model_path=model_path, allow_download=False)
+        return GPT4All(model_name=model_name, model_path=model_path)
 
     # Download the model to a tmp file. Once the download is completed, move the tmp file to the actual file
     tmp_filename = filename + ".tmp"
diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py
index cb9ecdcc..4de3c623 100644
--- a/src/khoj/processor/conversation/prompts.py
+++ b/src/khoj/processor/conversation/prompts.py
@@ -209,7 +209,7 @@ A: Bob is {bob_tom_age_difference} years older than Tom. As Bob is {bob_age} yea
 
 Q: What does yesterday's note say?
 
-["Note from {yesterday_date} dt='{yesterday_date}'"]
+["Note from {yesterday_date} dt>='{yesterday_date}' dt<'{current_date}'"]
 
 A: Yesterday's note contains the following information: ...
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 45df8ffb..be332eae 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -26,7 +26,8 @@ from khoj.utils.rawconfig import (
     TextSearchConfig,
     ImageSearchConfig,
 )
-from khoj.utils import state
+from khoj.utils import state, fs_syncer
+from khoj.routers.indexer import configure_content
 from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
 from khoj.search_filter.date_filter import DateFilter
@@ -220,15 +221,10 @@ def chat_client(md_content_config: ContentConfig, search_config: SearchConfig, p
     state.SearchType = configure_search_types(state.config)
 
     # Index Markdown Content for Search
-    filters = [DateFilter(), WordFilter(), FileFilter()]
     state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
-    state.content_index.markdown = text_search.setup(
-        MarkdownToJsonl,
-        get_sample_data("markdown"),
-        md_content_config.markdown,
-        state.search_models.text_search.bi_encoder,
-        regenerate=False,
-        filters=filters,
+    all_files = fs_syncer.collect_files(state.config.content_type)
+    state.content_index = configure_content(
+        state.content_index, state.config.content_type, all_files, state.search_models
     )
 
     # Initialize Processor from Config
@@ -273,7 +269,7 @@ def client(content_config: ContentConfig, search_config: SearchConfig, processor
 
 @pytest.fixture(scope="function")
 def client_offline_chat(
-    md_content_config: ContentConfig, search_config: SearchConfig, processor_config_offline_chat: ProcessorConfig
+    search_config: SearchConfig, processor_config_offline_chat: ProcessorConfig, content_config: ContentConfig
 ):
     # Initialize app state
     state.config.content_type = md_content_config
@@ -281,27 +277,12 @@ def client_offline_chat(
     state.SearchType = configure_search_types(state.config)
 
     # Index Markdown Content for Search
-    filters = [DateFilter(), WordFilter(), FileFilter()]
     state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
     state.search_models.image_search = image_search.initialize_model(search_config.image)
-    state.content_index.org = text_search.setup(
-        OrgToJsonl,
-        get_sample_data("org"),
-        content_config.org,
-        state.search_models.text_search.bi_encoder,
-        regenerate=False,
-    )
-    state.content_index.image = image_search.setup(
-        content_config.image, state.search_models.image_search, regenerate=False
-    )
 
-    state.content_index.markdown = text_search.setup(
-        MarkdownToJsonl,
-        get_sample_data("markdown"),
-        md_content_config.markdown,
-        state.search_models.text_search.bi_encoder,
-        regenerate=False,
-        filters=filters,
+    all_files = fs_syncer.collect_files(content_config.content_type)
+    state.content_index = configure_content(
+        state.content_index, state.config.content_type, all_files, state.search_models
     )
 
     # Initialize Processor from Config