Upgrade to latest GPT4All. Use Mistral as default offline chat model
GPT4All now supports GGUF llama.cpp chat models. The latest GPT4All (with Mistral) responds at least 3x faster: on a MacBook Pro, responses start in ~10s versus 30s-120s earlier. Mistral is also a better chat model, although it hallucinates more than Llama 2.
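
For orientation, a minimal sketch of what the upgraded dependency enables: loading a GGUF llama.cpp model directly through the gpt4all Python bindings (>= 2.0.0). The prompt and token budget are illustrative, not taken from this commit.

from gpt4all import GPT4All

# GPT4All >= 2.0.0 resolves GGUF (llama.cpp) model files and downloads them on first use.
model = GPT4All("mistral-7b-instruct-v0.1.Q4_0.gguf")

with model.chat_session():
    # Illustrative prompt; khoj wraps this in its own chat prompts and history.
    print(model.generate("Why do smaller quantized models respond faster?", max_tokens=128))
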
parent 6dc0df3afb
commit 0f1ebcae18

10 changed files with 84 additions and 11 deletions
@@ -60,8 +60,8 @@ dependencies = [
     "bs4 >= 0.0.1",
     "anyio == 3.7.1",
     "pymupdf >= 1.23.3",
-    "gpt4all == 1.0.12; platform_system == 'Linux' and platform_machine == 'x86_64'",
-    "gpt4all == 1.0.12; platform_system == 'Windows' or platform_system == 'Darwin'",
+    "gpt4all >= 2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'",
+    "gpt4all >= 2.0.0; platform_system == 'Windows' or platform_system == 'Darwin'",
 ]
 dynamic = ["version"]
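
The gpt4all pins above rely on PEP 508 environment markers, so pip only applies each requirement on the matching platform. A minimal sketch of reproducing that marker logic and reporting the installed version at runtime; the helper name is hypothetical and not part of khoj.

import platform
from importlib import metadata

def gpt4all_marker_matches() -> bool:
    # Mirrors the environment markers in the dependency list above (hypothetical helper).
    system = platform.system()
    return (system == "Linux" and platform.machine() == "x86_64") or system in ("Windows", "Darwin")

if gpt4all_marker_matches():
    # Raises importlib.metadata.PackageNotFoundError if gpt4all is not installed.
    print("gpt4all version:", metadata.version("gpt4all"))
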
src/khoj/migrations/migrate_offline_chat_default_model.py (new file, 69 lines added)
@@ -0,0 +1,69 @@
"""
Current format of khoj.yml
---
app:
    ...
content-type:
    ...
processor:
  conversation:
    offline-chat:
        enable-offline-chat: false
        chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin
    ...
search-type:
    ...

New format of khoj.yml
---
app:
    ...
content-type:
    ...
processor:
  conversation:
    offline-chat:
        enable-offline-chat: false
        chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf
    ...
search-type:
    ...
"""
import logging

from packaging import version

from khoj.utils.yaml import load_config_from_file, save_config_to_file

logger = logging.getLogger(__name__)


def migrate_offline_chat_default_model(args):
    schema_version = "0.12.4"
    raw_config = load_config_from_file(args.config_file)
    previous_version = raw_config.get("version")

    if "processor" not in raw_config:
        return args
    if raw_config["processor"] is None:
        return args
    if "conversation" not in raw_config["processor"]:
        return args
    if "offline-chat" not in raw_config["processor"]["conversation"]:
        return args
    if "chat-model" not in raw_config["processor"]["conversation"]["offline-chat"]:
        return args

    if previous_version is None or version.parse(previous_version) < version.parse("0.12.4"):
        logger.info(
            f"Upgrading config schema to {schema_version} from {previous_version} to change default (offline) chat model to mistral GGUF"
        )
        raw_config["version"] = schema_version

        # Update offline chat model to mistral in GGUF format to use latest GPT4All
        offline_chat_model = raw_config["processor"]["conversation"]["offline-chat"]["chat-model"]
        if offline_chat_model.endswith(".bin"):
            raw_config["processor"]["conversation"]["offline-chat"]["chat-model"] = "mistral-7b-instruct-v0.1.Q4_0.gguf"

        save_config_to_file(raw_config, args.config_file)

    return args
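
A minimal sketch of exercising this migration against a throwaway config file. The sample YAML and the SimpleNamespace stand-in for the CLI args are illustrative, and it assumes load_config_from_file/save_config_to_file accept a filesystem path, as in the code above.

from pathlib import Path
from types import SimpleNamespace

from khoj.migrations.migrate_offline_chat_default_model import migrate_offline_chat_default_model
from khoj.utils.yaml import load_config_from_file

# Illustrative pre-0.12.4 config still pointing at the old ggml llama-2 model.
sample = Path("khoj.sample.yml")
sample.write_text(
    "version: 0.12.3\n"
    "processor:\n"
    "  conversation:\n"
    "    offline-chat:\n"
    "      enable-offline-chat: false\n"
    "      chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin\n"
)

migrate_offline_chat_default_model(SimpleNamespace(config_file=sample))

# The chat-model should now be the Mistral GGUF and the version should read 0.12.4.
migrated = load_config_from_file(sample)
print(migrated["processor"]["conversation"]["offline-chat"]["chat-model"])
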
@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)

 def extract_questions_offline(
     text: str,
-    model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
+    model: str = "mistral-7b-instruct-v0.1.Q4_0.gguf",
     loaded_model: Union[Any, None] = None,
     conversation_log={},
     use_history: bool = True,
@@ -123,7 +123,7 @@ def converse_offline(
     references,
     user_query,
     conversation_log={},
-    model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
+    model: str = "mistral-7b-instruct-v0.1.Q4_0.gguf",
     loaded_model: Union[Any, None] = None,
     completion_func=None,
     conversation_command=ConversationCommand.Default,
@@ -14,9 +14,9 @@ def download_model(model_name: str):
     # Use GPU for Chat Model, if available
     try:
         model = GPT4All(model_name=model_name, device="gpu")
-        logger.debug("Loaded chat model to GPU.")
+        logger.debug(f"Loaded {model_name} chat model to GPU.")
     except ValueError:
         model = GPT4All(model_name=model_name)
-        logger.debug("Loaded chat model to CPU.")
+        logger.debug(f"Loaded {model_name} chat model to CPU.")

     return model
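
The hunk above keeps the GPU-first, CPU-fallback loading pattern and only adds the model name to the log messages. A minimal usage sketch of the helper; the prompt is illustrative and the generate call with max_tokens is assumed from the gpt4all Python bindings.

from khoj.processor.conversation.gpt4all.utils import download_model

# download_model tries the GPU first and falls back to CPU on ValueError, as shown above.
model = download_model("mistral-7b-instruct-v0.1.Q4_0.gguf")
print(model.generate("Summarize why GGUF models start responding faster here.", max_tokens=64))
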
@@ -20,9 +20,11 @@ model_to_prompt_size = {
     "gpt-4": 8192,
     "llama-2-7b-chat.ggmlv3.q4_0.bin": 1548,
     "gpt-3.5-turbo-16k": 15000,
+    "mistral-7b-instruct-v0.1.Q4_0.gguf": 1548,
 }
 model_to_tokenizer = {
     "llama-2-7b-chat.ggmlv3.q4_0.bin": "hf-internal-testing/llama-tokenizer",
+    "mistral-7b-instruct-v0.1.Q4_0.gguf": "mistralai/Mistral-7B-Instruct-v0.1",
 }
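
A hedged sketch of how such a model-to-tokenizer mapping can be used to keep prompts under the model's prompt-size budget. It assumes the transformers package is available; this is one plausible way the tables are consumed, not khoj's exact code.

from transformers import AutoTokenizer

model_name = "mistral-7b-instruct-v0.1.Q4_0.gguf"
max_prompt_size = 1548  # from model_to_prompt_size above

# Token counts come from the Hugging Face tokenizer mapped to the offline model.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
prompt = "Illustrative prompt assembled from chat history and notes..."
n_tokens = len(tokenizer.encode(prompt))
if n_tokens > max_prompt_size:
    print(f"Prompt too long ({n_tokens} tokens); truncate history before sending to {model_name}.")
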
@@ -10,6 +10,7 @@ from khoj.migrations.migrate_version import migrate_config_to_version
 from khoj.migrations.migrate_processor_config_openai import migrate_processor_conversation_schema
 from khoj.migrations.migrate_offline_model import migrate_offline_model
 from khoj.migrations.migrate_offline_chat_schema import migrate_offline_chat_schema
+from khoj.migrations.migrate_offline_chat_default_model import migrate_offline_chat_default_model


 def cli(args=None):
@@ -61,6 +62,7 @@ def run_migrations(args):
         migrate_processor_conversation_schema,
         migrate_offline_model,
         migrate_offline_chat_schema,
+        migrate_offline_chat_default_model,
     ]
     for migration in migrations:
         args = migration(args)
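
The run_migrations hunk shows the pattern khoj uses: each migration is a function that takes the parsed args, rewrites the config if needed, and returns args so the next migration can run. A minimal, self-contained sketch of that chain with hypothetical stand-in migrations (not khoj code):

from types import SimpleNamespace

# Hypothetical stand-in migrations; each mirrors the khoj signature: args in, args out.
def bump_version(args):
    args.config["version"] = "0.12.4"
    return args

def swap_chat_model(args):
    if args.config.get("chat-model", "").endswith(".bin"):
        args.config["chat-model"] = "mistral-7b-instruct-v0.1.Q4_0.gguf"
    return args

args = SimpleNamespace(config={"version": "0.12.3", "chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin"})
for migration in [bump_version, swap_chat_model]:
    args = migration(args)
print(args.config)
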
@@ -55,7 +55,7 @@ empty_config = {
         },
         "offline-chat": {
             "enable-offline-chat": False,
-            "chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
+            "chat-model": "mistral-7b-instruct-v0.1.Q4_0.gguf",
         },
         "tokenizer": None,
         "max-prompt-size": None,
@@ -132,7 +132,7 @@ default_config = {
         },
         "offline-chat": {
             "enable-offline-chat": False,
-            "chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
+            "chat-model": "mistral-7b-instruct-v0.1.Q4_0.gguf",
         },
         "tokenizer": None,
         "max-prompt-size": None,
@@ -93,7 +93,7 @@ class OpenAIProcessorConfig(ConfigBase):

 class OfflineChatProcessorConfig(ConfigBase):
     enable_offline_chat: Optional[bool] = False
-    chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin"
+    chat_model: Optional[str] = "mistral-7b-instruct-v0.1.Q4_0.gguf"


 class ConversationProcessorConfig(ConfigBase):
@@ -206,7 +206,7 @@ def processor_config_offline_chat(tmp_path_factory):

     # Setup conversation processor
     processor_config = ProcessorConfig()
-    offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True)
+    offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True, chat_model="mistral-7b-instruct-v0.1.Q4_0.gguf")
     processor_config.conversation = ConversationProcessorConfig(
         offline_chat=offline_chat,
         conversation_logfile=processor_dir.joinpath("conversation_logs.json"),
@@ -24,7 +24,7 @@ from khoj.processor.conversation.gpt4all.utils import download_model

 from khoj.processor.conversation.utils import message_to_log

-MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
+MODEL_NAME = "mistral-7b-instruct-v0.1.Q4_0.gguf"


 @pytest.fixture(scope="session")