diff --git a/pyproject.toml b/pyproject.toml
index 12ef261b..26d08c25 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -64,7 +64,7 @@ dependencies = [
     "pymupdf >= 1.23.5",
     "django == 4.2.10",
     "authlib == 1.2.1",
-    "llama-cpp-python == 0.2.56",
+    "llama-cpp-python == 0.2.64",
     "itsdangerous == 2.1.2",
     "httpx == 0.25.0",
     "pgvector == 0.2.4",
diff --git a/src/khoj/processor/conversation/offline/utils.py b/src/khoj/processor/conversation/offline/utils.py
index c43c7353..05de4b9f 100644
--- a/src/khoj/processor/conversation/offline/utils.py
+++ b/src/khoj/processor/conversation/offline/utils.py
@@ -2,6 +2,7 @@ import glob
 import logging
 import math
 import os
+from typing import Any, Dict
 
 from huggingface_hub.constants import HF_HUB_CACHE
 
@@ -14,12 +15,16 @@ logger = logging.getLogger(__name__)
 def download_model(repo_id: str, filename: str = "*Q4_K_M.gguf", max_tokens: int = None):
     # Initialize Model Parameters
     # Use n_ctx=0 to get context size from the model
-    kwargs = {"n_threads": 4, "n_ctx": 0, "verbose": False}
+    kwargs: Dict[str, Any] = {"n_threads": 4, "n_ctx": 0, "verbose": False}
 
     # Decide whether to load model to GPU or CPU
     device = "gpu" if state.chat_on_gpu and state.device != "cpu" else "cpu"
     kwargs["n_gpu_layers"] = -1 if device == "gpu" else 0
 
+    # Add chat format if known
+    if "llama-3" in repo_id.lower():
+        kwargs["chat_format"] = "llama-3"
+
     # Check if the model is already downloaded
     model_path = load_model_from_cache(repo_id, filename)
     chat_model = None
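
For context, here is a minimal sketch (not part of the diff) of how the `chat_format` kwarg assembled in `download_model` is ultimately consumed by `llama-cpp-python`. The repo id and model path below are illustrative placeholders, and the kwargs mirror the ones built up in the patched function; the exact set of supported chat formats depends on the installed `llama-cpp-python` version (the bump to 0.2.64 is what adds the built-in `"llama-3"` template).

```python
# Hypothetical usage sketch: how the kwargs from download_model reach llama-cpp-python.
# Repo id and model path are placeholders, not values taken from the Khoj codebase.
from llama_cpp import Llama

kwargs = {"n_threads": 4, "n_ctx": 0, "verbose": False, "n_gpu_layers": -1}

repo_id = "bartowski/Meta-Llama-3-8B-Instruct-GGUF"  # assumed example repo id
if "llama-3" in repo_id.lower():
    # Ask llama-cpp-python to apply its built-in Llama 3 chat template
    kwargs["chat_format"] = "llama-3"

# Load a locally cached GGUF file with the assembled options
chat_model = Llama(model_path="/path/to/model.gguf", **kwargs)

# chat_format controls how these messages are templated into the prompt
response = chat_model.create_chat_completion(
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=64,
)
print(response["choices"][0]["message"]["content"])
```

Without the explicit `chat_format`, llama-cpp-python falls back to whatever template it can infer from the GGUF metadata, which is why the patch only sets it when the repo id clearly indicates a Llama 3 model.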