From 6dc0df3afbdcc2913fc65461cf07d4ba92273924 Mon Sep 17 00:00:00 2001
From: sabaimran <65192171+sabaimran@users.noreply.github.com>
Date: Fri, 20 Oct 2023 14:10:21 -0700
Subject: [PATCH 1/5] Pin pytorch version to 2.0.1 in order to avoid exit code 139 in Docker container (#512)

---
 docker-compose.yml | 2 ++
 pyproject.toml | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 5f1bb1f9..bc3da2a9 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -25,9 +25,11 @@ services:
       - ./tests/data/embeddings/:/root/.khoj/content/
       - ./tests/data/models/:/root/.khoj/search/
       - khoj_config:/root/.khoj/
+      - sentence_tranformer_models:/root/.cache/torch/sentence_transformers
     # Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/
     command: --host="0.0.0.0" --port=42110 -vv
 
 
 volumes:
   khoj_config:
+  sentence_tranformer_models:

diff --git a/pyproject.toml b/pyproject.toml
index b3529aa9..f0cc2234 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,7 +52,7 @@ dependencies = [
     "schedule == 1.1.0",
     "sentence-transformers == 2.2.2",
     "transformers >= 4.28.0",
-    "torch >= 2.0.1",
+    "torch == 2.0.1",
     "uvicorn == 0.17.6",
     "aiohttp == 3.8.5",
     "langchain >= 0.0.187",

From 0f1ebcae18abc8969cb367564077ef8d20695be3 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Sun, 22 Oct 2023 18:16:02 -0700
Subject: [PATCH 2/5] Upgrade to latest GPT4All. Use Mistral as default offline chat model

GPT4All now supports GGUF llama.cpp chat models. The latest GPT4All
(+ Mistral) responds at least 3x faster: on a MacBook Pro, responses now
start in ~10s vs 30s-120s earlier.

Mistral is also a better chat model, although it hallucinates more than
llama-2.
---
 pyproject.toml | 4 +-
 .../migrate_offline_chat_default_model.py | 69 +++++++++++++++++++
 .../conversation/gpt4all/chat_model.py | 4 +-
 .../processor/conversation/gpt4all/utils.py | 4 +-
 src/khoj/processor/conversation/utils.py | 2 +
 src/khoj/utils/cli.py | 2 +
 src/khoj/utils/constants.py | 4 +-
 src/khoj/utils/rawconfig.py | 2 +-
 tests/conftest.py | 2 +-
 tests/test_gpt4all_chat_actors.py | 2 +-
 10 files changed, 84 insertions(+), 11 deletions(-)
 create mode 100644 src/khoj/migrations/migrate_offline_chat_default_model.py

diff --git a/pyproject.toml b/pyproject.toml
index f0cc2234..bac662a4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -60,8 +60,8 @@ dependencies = [
     "bs4 >= 0.0.1",
     "anyio == 3.7.1",
     "pymupdf >= 1.23.3",
-    "gpt4all == 1.0.12; platform_system == 'Linux' and platform_machine == 'x86_64'",
-    "gpt4all == 1.0.12; platform_system == 'Windows' or platform_system == 'Darwin'",
+    "gpt4all >= 2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'",
+    "gpt4all >= 2.0.0; platform_system == 'Windows' or platform_system == 'Darwin'",
 ]
 
 dynamic = ["version"]

diff --git a/src/khoj/migrations/migrate_offline_chat_default_model.py b/src/khoj/migrations/migrate_offline_chat_default_model.py
new file mode 100644
index 00000000..b7f4cf94
--- /dev/null
+++ b/src/khoj/migrations/migrate_offline_chat_default_model.py
@@ -0,0 +1,69 @@
+"""
+Current format of khoj.yml
+---
+app:
+  ...
+content-type:
+  ...
+processor:
+  conversation:
+    offline-chat:
+      enable-offline-chat: false
+      chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin
+  ...
+search-type:
+  ...
+
+New format of khoj.yml
+---
+app:
+  ...
+content-type:
+  ...
+processor: + conversation: + offline-chat: + enable-offline-chat: false + chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf + ... +search-type: + ... +""" +import logging +from packaging import version + +from khoj.utils.yaml import load_config_from_file, save_config_to_file + + +logger = logging.getLogger(__name__) + + +def migrate_offline_chat_default_model(args): + schema_version = "0.12.4" + raw_config = load_config_from_file(args.config_file) + previous_version = raw_config.get("version") + + if "processor" not in raw_config: + return args + if raw_config["processor"] is None: + return args + if "conversation" not in raw_config["processor"]: + return args + if "offline-chat" not in raw_config["processor"]["conversation"]: + return args + if "chat-model" not in raw_config["processor"]["conversation"]["offline-chat"]: + return args + + if previous_version is None or version.parse(previous_version) < version.parse("0.12.4"): + logger.info( + f"Upgrading config schema to {schema_version} from {previous_version} to change default (offline) chat model to mistral GGUF" + ) + raw_config["version"] = schema_version + + # Update offline chat model to mistral in GGUF format to use latest GPT4All + offline_chat_model = raw_config["processor"]["conversation"]["offline-chat"]["chat-model"] + if offline_chat_model.endswith(".bin"): + raw_config["processor"]["conversation"]["offline-chat"]["chat-model"] = "mistral-7b-instruct-v0.1.Q4_0.gguf" + + save_config_to_file(raw_config, args.config_file) + return args diff --git a/src/khoj/processor/conversation/gpt4all/chat_model.py b/src/khoj/processor/conversation/gpt4all/chat_model.py index 7e92d002..04a004f0 100644 --- a/src/khoj/processor/conversation/gpt4all/chat_model.py +++ b/src/khoj/processor/conversation/gpt4all/chat_model.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) def extract_questions_offline( text: str, - model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin", + model: str = "mistral-7b-instruct-v0.1.Q4_0.gguf", loaded_model: Union[Any, None] = None, conversation_log={}, use_history: bool = True, @@ -123,7 +123,7 @@ def converse_offline( references, user_query, conversation_log={}, - model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin", + model: str = "mistral-7b-instruct-v0.1.Q4_0.gguf", loaded_model: Union[Any, None] = None, completion_func=None, conversation_command=ConversationCommand.Default, diff --git a/src/khoj/processor/conversation/gpt4all/utils.py b/src/khoj/processor/conversation/gpt4all/utils.py index d5201780..2bb1fbbc 100644 --- a/src/khoj/processor/conversation/gpt4all/utils.py +++ b/src/khoj/processor/conversation/gpt4all/utils.py @@ -14,9 +14,9 @@ def download_model(model_name: str): # Use GPU for Chat Model, if available try: model = GPT4All(model_name=model_name, device="gpu") - logger.debug("Loaded chat model to GPU.") + logger.debug(f"Loaded {model_name} chat model to GPU.") except ValueError: model = GPT4All(model_name=model_name) - logger.debug("Loaded chat model to CPU.") + logger.debug(f"Loaded {model_name} chat model to CPU.") return model diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index 83d51f2d..b0d401fa 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -20,9 +20,11 @@ model_to_prompt_size = { "gpt-4": 8192, "llama-2-7b-chat.ggmlv3.q4_0.bin": 1548, "gpt-3.5-turbo-16k": 15000, + "mistral-7b-instruct-v0.1.Q4_0.gguf": 1548, } model_to_tokenizer = { "llama-2-7b-chat.ggmlv3.q4_0.bin": 
"hf-internal-testing/llama-tokenizer", + "mistral-7b-instruct-v0.1.Q4_0.gguf": "mistralai/Mistral-7B-Instruct-v0.1", } diff --git a/src/khoj/utils/cli.py b/src/khoj/utils/cli.py index 1d6106cb..7c72b101 100644 --- a/src/khoj/utils/cli.py +++ b/src/khoj/utils/cli.py @@ -10,6 +10,7 @@ from khoj.migrations.migrate_version import migrate_config_to_version from khoj.migrations.migrate_processor_config_openai import migrate_processor_conversation_schema from khoj.migrations.migrate_offline_model import migrate_offline_model from khoj.migrations.migrate_offline_chat_schema import migrate_offline_chat_schema +from khoj.migrations.migrate_offline_chat_default_model import migrate_offline_chat_default_model def cli(args=None): @@ -61,6 +62,7 @@ def run_migrations(args): migrate_processor_conversation_schema, migrate_offline_model, migrate_offline_chat_schema, + migrate_offline_chat_default_model, ] for migration in migrations: args = migration(args) diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py index 9ed97798..7f534bf6 100644 --- a/src/khoj/utils/constants.py +++ b/src/khoj/utils/constants.py @@ -55,7 +55,7 @@ empty_config = { }, "offline-chat": { "enable-offline-chat": False, - "chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin", + "chat-model": "mistral-7b-instruct-v0.1.Q4_0.gguf", }, "tokenizer": None, "max-prompt-size": None, @@ -132,7 +132,7 @@ default_config = { }, "offline-chat": { "enable-offline-chat": False, - "chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin", + "chat-model": "mistral-7b-instruct-v0.1.Q4_0.gguf", }, "tokenizer": None, "max-prompt-size": None, diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index f7c42266..cc4fe208 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -93,7 +93,7 @@ class OpenAIProcessorConfig(ConfigBase): class OfflineChatProcessorConfig(ConfigBase): enable_offline_chat: Optional[bool] = False - chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin" + chat_model: Optional[str] = "mistral-7b-instruct-v0.1.Q4_0.gguf" class ConversationProcessorConfig(ConfigBase): diff --git a/tests/conftest.py b/tests/conftest.py index f75dfceb..8b661f50 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -206,7 +206,7 @@ def processor_config_offline_chat(tmp_path_factory): # Setup conversation processor processor_config = ProcessorConfig() - offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True) + offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True, chat_model="mistral-7b-instruct-v0.1.Q4_0.gguf") processor_config.conversation = ConversationProcessorConfig( offline_chat=offline_chat, conversation_logfile=processor_dir.joinpath("conversation_logs.json"), diff --git a/tests/test_gpt4all_chat_actors.py b/tests/test_gpt4all_chat_actors.py index 76ed26e7..782b54f2 100644 --- a/tests/test_gpt4all_chat_actors.py +++ b/tests/test_gpt4all_chat_actors.py @@ -24,7 +24,7 @@ from khoj.processor.conversation.gpt4all.utils import download_model from khoj.processor.conversation.utils import message_to_log -MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_0.bin" +MODEL_NAME = "mistral-7b-instruct-v0.1.Q4_0.gguf" @pytest.fixture(scope="session") From 5bb14a05a06b5fd80d6ed42bf93e8953f7791375 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 22 Oct 2023 18:43:02 -0700 Subject: [PATCH 3/5] Update system requirements in docs for offline chat models --- docs/chat.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/chat.md b/docs/chat.md index b900d052..2efd7b1b 
100644
--- a/docs/chat.md
+++ b/docs/chat.md
@@ -10,7 +10,8 @@ Offline chat stays completely private and works without internet. But it is slower, lower quality and more compute intensive.
 
 > **System Requirements**:
-> - Machine with at least **6 GB of RAM** and **4 GB of Disk** available
+> - Minimum 8 GB RAM. Recommend **16Gb VRAM**
+> - Minimum **5 GB of Disk** available
 > - A CPU supporting [AVX or AVX2 instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) is required
 > - A Mac M1+ or [Vulcan supported GPU](https://vulkan.gpuinfo.org/) should significantly speed up chat response times

From 9677eae79192aed2171a433f4ae4d9adff7afba1 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Wed, 25 Oct 2023 17:51:46 -0700
Subject: [PATCH 4/5] Expose CLI flag to disable using GPU for offline chat model

- Offline chat models output gibberish when loaded onto some GPUs.
  GPU support with Vulkan in GPT4All seems a bit buggy
- This change mitigates the upstream issue by allowing the user to
  manually disable using the GPU for offline chat

Closes #516
---
 src/khoj/main.py | 1 +
 src/khoj/processor/conversation/gpt4all/utils.py | 7 +++++--
 src/khoj/utils/cli.py | 6 ++++++
 src/khoj/utils/state.py | 2 ++
 4 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/khoj/main.py b/src/khoj/main.py
index 7b1bfd7e..4c759c2a 100644
--- a/src/khoj/main.py
+++ b/src/khoj/main.py
@@ -94,6 +94,7 @@ def set_state(args):
     state.port = args.port
     state.demo = args.demo
     state.khoj_version = version("khoj-assistant")
+    state.chat_on_gpu = args.chat_on_gpu
 
 
 def start_server(app, host=None, port=None, socket=None):
diff --git a/src/khoj/processor/conversation/gpt4all/utils.py b/src/khoj/processor/conversation/gpt4all/utils.py
index 2bb1fbbc..45a1158e 100644
--- a/src/khoj/processor/conversation/gpt4all/utils.py
+++ b/src/khoj/processor/conversation/gpt4all/utils.py
@@ -1,5 +1,7 @@
 import logging
 
+from khoj.utils import state
+
 logger = logging.getLogger(__name__)
 
 
@@ -13,8 +15,9 @@ def download_model(model_name: str):
 
     # Use GPU for Chat Model, if available
     try:
-        model = GPT4All(model_name=model_name, device="gpu")
-        logger.debug(f"Loaded {model_name} chat model to GPU.")
+        device = "gpu" if state.chat_on_gpu else "cpu"
+        model = GPT4All(model_name=model_name, device=device)
+        logger.debug(f"Loaded {model_name} chat model to {device.upper()}")
     except ValueError:
         model = GPT4All(model_name=model_name)
         logger.debug(f"Loaded {model_name} chat model to CPU.")
diff --git a/src/khoj/utils/cli.py b/src/khoj/utils/cli.py
index 7c72b101..9f129b17 100644
--- a/src/khoj/utils/cli.py
+++ b/src/khoj/utils/cli.py
@@ -34,10 +34,16 @@ def cli(args=None):
         help="Path to UNIX socket for server. Use to run server behind reverse proxy. Default: /tmp/uvicorn.sock",
     )
     parser.add_argument("--version", "-V", action="store_true", help="Print the installed Khoj version and exit")
+    parser.add_argument(
+        "--disable-chat-on-gpu", action="store_true", default=False, help="Disable using GPU for the offline chat model"
+    )
     parser.add_argument("--demo", action="store_true", default=False, help="Run Khoj in demo mode")
 
     args = parser.parse_args(args)
 
+    # Set default values for arguments
+    args.chat_on_gpu = not args.disable_chat_on_gpu
+
     args.version_no = version("khoj-assistant")
     if args.version:
         # Show version of khoj installed and exit
diff --git a/src/khoj/utils/state.py b/src/khoj/utils/state.py
index 5ac8a838..e9b2ca6c 100644
--- a/src/khoj/utils/state.py
+++ b/src/khoj/utils/state.py
@@ -31,6 +31,8 @@ telemetry: List[Dict[str, str]] = []
 previous_query: str = None
 demo: bool = False
 khoj_version: str = None
+chat_on_gpu: bool = True
+
 
 if torch.cuda.is_available():
     # Use CUDA GPU

From 354605e73e659d0443c67dffe56b226a16dc6d23 Mon Sep 17 00:00:00 2001
From: Tuan Nguyen
Date: Thu, 2 Nov 2023 06:09:45 +0700
Subject: [PATCH 5/5] Autofocus chat input when opening chat (#524)

---
 src/interface/obsidian/src/chat_modal.ts | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/interface/obsidian/src/chat_modal.ts b/src/interface/obsidian/src/chat_modal.ts
index 66381071..d390cbf2 100644
--- a/src/interface/obsidian/src/chat_modal.ts
+++ b/src/interface/obsidian/src/chat_modal.ts
@@ -38,7 +38,7 @@ export class KhojChatModal extends Modal {
         await this.getChatHistory();
 
         // Add chat input field
-        contentEl.createEl("input",
+        const chatInput = contentEl.createEl("input",
             {
                 attr: {
                     type: "text",
                     class: "khoj-chat-input option"
                 }
             })
-            .addEventListener('change', (event) => { this.result = (<HTMLInputElement>event.target).value });
+        chatInput.addEventListener('change', (event) => { this.result = (<HTMLInputElement>event.target).value });
 
         // Scroll to bottom of modal, till the send message input box
         this.modalEl.scrollTop = this.modalEl.scrollHeight;
+        chatInput.focus();
     }
 
     generateReference(messageEl: any, reference: string, index: number) {
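
Taken together, PATCH 4/5 threads a single switch through the stack: the new --disable-chat-on-gpu CLI flag sets args.chat_on_gpu, set_state() copies it into state.chat_on_gpu, and download_model() reads that flag to pick the device requested from GPT4All, falling back to CPU when GPU initialization raises a ValueError. Below is a minimal sketch of that control flow condensed from the diffs above; it is illustrative only, not part of the patches: the load_offline_chat_model helper and the module-level chat_on_gpu variable are stand-in names, and it assumes the gpt4all >= 2.0.0 Python bindings pinned in pyproject.toml.

# Sketch of the GPU toggle introduced in PATCH 4/5 (illustrative, not from the patches)
from gpt4all import GPT4All

# In Khoj this lives in state.chat_on_gpu, set from the --disable-chat-on-gpu CLI flag
chat_on_gpu = True


def load_offline_chat_model(model_name: str = "mistral-7b-instruct-v0.1.Q4_0.gguf") -> GPT4All:
    # Request the GPU only when the user has not disabled it
    device = "gpu" if chat_on_gpu else "cpu"
    try:
        return GPT4All(model_name=model_name, device=device)
    except ValueError:
        # Mirror the patch's fallback: retry on the default (CPU) device if the GPU cannot be used
        return GPT4All(model_name=model_name)

With these patches applied, starting the server with the --disable-chat-on-gpu flag keeps the offline chat model on the CPU, which works around the gibberish output seen on some GPUs.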