diff --git a/documentation/docs/features/chat.md b/documentation/docs/features/chat.md
index ed4fe9fe..5876dc76 100644
--- a/documentation/docs/features/chat.md
+++ b/documentation/docs/features/chat.md
@@ -25,7 +25,7 @@ Offline chat stays completely private and can work without internet using open-s
 > - An Nvidia, AMD GPU or a Mac M1+ machine would significantly speed up chat response times
 
 1. Open your [Khoj offline settings](http://localhost:42110/server/admin/database/offlinechatprocessorconversationconfig/) and click *Enable* on the Offline Chat configuration.
-2. Open your [Chat model options settings](http://localhost:42110/server/admin/database/chatmodeloptions/) and add any [GGUF chat model](https://huggingface.co/models?library=gguf) to use for offline chat. Make sure to use `Offline` as its type. For a balanced chat model that runs well on standard consumer hardware we recommend using [Hermes-2-Pro-Mistral-7B by NousResearch](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) by default.
+2. Open your [Chat model options settings](http://localhost:42110/server/admin/database/chatmodeloptions/) and add any [GGUF chat model](https://huggingface.co/models?library=gguf) to use for offline chat. Make sure to use `Offline` as its type. For a balanced chat model that runs well on standard consumer hardware we recommend using [Llama 3.1 by Meta](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) by default.
 
 :::tip[Note]
diff --git a/documentation/docs/get-started/setup.mdx b/documentation/docs/get-started/setup.mdx
index bc954bde..61d2ef3d 100644
--- a/documentation/docs/get-started/setup.mdx
+++ b/documentation/docs/get-started/setup.mdx
@@ -222,7 +222,7 @@ Using Ollama? See the [Ollama Integration](/advanced/ollama) section for more cu
 Any chat model on Huggingface in GGUF format can be used for local chat. Here's how you can set it up:
 
 1. No need to setup a conversation processor config!
-2. Go over to configure your [chat model options](http://localhost:42110/server/admin/database/chatmodeloptions/). Set the `chat-model` field to a supported chat model[^1] of your choice. For example, we recommend `NousResearch/Hermes-2-Pro-Mistral-7B-GGUF`, but [any gguf model on huggingface](https://huggingface.co/models?library=gguf) should work.
+2. Go over to configure your [chat model options](http://localhost:42110/server/admin/database/chatmodeloptions/). Set the `chat-model` field to a supported chat model[^1] of your choice. For example, we recommend `bartowski/Meta-Llama-3.1-8B-Instruct-GGUF`, but [any gguf model on huggingface](https://huggingface.co/models?library=gguf) should work.
    - Make sure to set the `model-type` to `Offline`. Do not set `openai config`.
    - The `tokenizer` and `max-prompt-size` fields are optional. You can set these for non-standard models (i.e not Mistral or Llama based models) or when you know the token limit of the model to improve context stuffing.
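The two documentation hunks above describe the setup through the admin panel. As a hedged illustration only, the same record could be created from a Django shell with Khoj's ORM; the field names and the `"offline"` model type string below mirror the `ChatModelOptions` model and test factory changed later in this patch, but the scripted approach itself is an assumption rather than a documented workflow.

```python
# Hypothetical sketch: create the recommended offline chat model option
# programmatically (e.g. from a Django shell) instead of via the admin UI.
# Field names mirror ChatModelOptions as changed in this patch.
from khoj.database.models import ChatModelOptions

ChatModelOptions.objects.get_or_create(
    chat_model="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",  # any GGUF repo on Hugging Face should work
    model_type="offline",  # same as selecting `Offline` in the admin panel
    max_prompt_size=None,  # optional; Khoj infers a size when left unset
    tokenizer=None,  # optional; only needed for non-standard model families
)
```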
diff --git a/src/khoj/database/migrations/0058_alter_chatmodeloptions_chat_model.py b/src/khoj/database/migrations/0058_alter_chatmodeloptions_chat_model.py
new file mode 100644
index 00000000..ea4515e1
--- /dev/null
+++ b/src/khoj/database/migrations/0058_alter_chatmodeloptions_chat_model.py
@@ -0,0 +1,17 @@
+# Generated by Django 5.0.7 on 2024-08-19 12:37
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("database", "0057_remove_serverchatsettings_default_model_and_more"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="chatmodeloptions",
+            name="chat_model",
+            field=models.CharField(default="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", max_length=200),
+        ),
+    ]
diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py
index 72c93157..2468ffc9 100644
--- a/src/khoj/database/models/__init__.py
+++ b/src/khoj/database/models/__init__.py
@@ -91,7 +91,7 @@ class ChatModelOptions(BaseModel):
     max_prompt_size = models.IntegerField(default=None, null=True, blank=True)
     subscribed_max_prompt_size = models.IntegerField(default=None, null=True, blank=True)
     tokenizer = models.CharField(max_length=200, default=None, null=True, blank=True)
-    chat_model = models.CharField(max_length=200, default="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF")
+    chat_model = models.CharField(max_length=200, default="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF")
     model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OFFLINE)
     openai_config = models.ForeignKey(
         OpenAIProcessorConversationConfig, on_delete=models.CASCADE, default=None, null=True, blank=True
diff --git a/src/khoj/processor/conversation/offline/chat_model.py b/src/khoj/processor/conversation/offline/chat_model.py
index 1251dcec..c62d1e00 100644
--- a/src/khoj/processor/conversation/offline/chat_model.py
+++ b/src/khoj/processor/conversation/offline/chat_model.py
@@ -24,7 +24,7 @@ logger = logging.getLogger(__name__)
 
 def extract_questions_offline(
     text: str,
-    model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
+    model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
     loaded_model: Union[Any, None] = None,
     conversation_log={},
     use_history: bool = True,
@@ -141,7 +141,7 @@ def converse_offline(
     references=[],
     online_results=[],
     conversation_log={},
-    model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
+    model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
     loaded_model: Union[Any, None] = None,
     completion_func=None,
     conversation_commands=[ConversationCommand.Default],
@@ -240,7 +240,7 @@ def llm_thread(g, messages: List[ChatMessage], model: Any, max_prompt_size: int
 def send_message_to_model_offline(
     messages: List[ChatMessage],
     loaded_model=None,
-    model="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
+    model="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
     temperature: float = 0.2,
     streaming=False,
     stop=[],
diff --git a/src/khoj/processor/conversation/offline/utils.py b/src/khoj/processor/conversation/offline/utils.py
index 66017b36..88082ad1 100644
--- a/src/khoj/processor/conversation/offline/utils.py
+++ b/src/khoj/processor/conversation/offline/utils.py
@@ -75,6 +75,6 @@ def load_model_from_cache(repo_id: str, filename: str, repo_type="models"):
 def infer_max_tokens(model_context_window: int, configured_max_tokens=None) -> int:
     """Infer max prompt size based on device memory and max context window supported by the model"""
     configured_max_tokens = math.inf if configured_max_tokens is None else configured_max_tokens
-    vram_based_n_ctx = int(get_device_memory() / 2e6)  # based on heuristic
+    vram_based_n_ctx = int(get_device_memory() / 1e6)  # based on heuristic
     configured_max_tokens = configured_max_tokens or math.inf  # do not use if set to None
     return min(configured_max_tokens, vram_based_n_ctx, model_context_window)
diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py
index ea7368e6..251ac197 100644
--- a/src/khoj/processor/conversation/utils.py
+++ b/src/khoj/processor/conversation/utils.py
@@ -25,6 +25,7 @@ model_to_prompt_size = {
     "gpt-4-turbo-preview": 20000,
     "TheBloke/Mistral-7B-Instruct-v0.2-GGUF": 3500,
     "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF": 3500,
+    "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF": 20000,
 }
 
 model_to_tokenizer: Dict[str, str] = {}
diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py
index 0e88075f..03dad75c 100644
--- a/src/khoj/utils/config.py
+++ b/src/khoj/utils/config.py
@@ -70,7 +70,7 @@ class OfflineChatProcessorConfig:
 
 
 class OfflineChatProcessorModel:
-    def __init__(self, chat_model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", max_tokens: int = None):
+    def __init__(self, chat_model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", max_tokens: int = None):
         self.chat_model = chat_model
         self.loaded_model = None
         try:
diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py
index c3d8a186..9b7ffb77 100644
--- a/src/khoj/utils/constants.py
+++ b/src/khoj/utils/constants.py
@@ -8,7 +8,7 @@ empty_escape_sequences = "\n|\r|\t| "
 app_env_filepath = "~/.khoj/env"
 telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry"
 content_directory = "~/.khoj/content/"
-default_offline_chat_model = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF"
+default_offline_chat_model = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
 default_online_chat_model = "gpt-4-turbo-preview"
 
 empty_config = {
diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py
index 617f37ea..6a788531 100644
--- a/src/khoj/utils/rawconfig.py
+++ b/src/khoj/utils/rawconfig.py
@@ -93,7 +93,7 @@ class OpenAIProcessorConfig(ConfigBase):
 
 
 class OfflineChatProcessorConfig(ConfigBase):
-    chat_model: Optional[str] = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF"
+    chat_model: Optional[str] = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
 
 
 class ConversationProcessorConfig(ConfigBase):
diff --git a/tests/conftest.py b/tests/conftest.py
index 61578ce2..0fe9d360 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -378,7 +378,7 @@ def client_offline_chat(search_config: SearchConfig, default_user2: KhojUser):
 
     # Initialize Processor from Config
     ChatModelOptionsFactory(
-        chat_model="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
+        chat_model="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
        tokenizer=None,
        max_prompt_size=None,
        model_type="offline",
diff --git a/tests/helpers.py b/tests/helpers.py
index 7894ffa2..2e8e5671 100644
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -49,7 +49,7 @@ class ChatModelOptionsFactory(factory.django.DjangoModelFactory):
     max_prompt_size = 3500
     tokenizer = None
-    chat_model = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF"
+    chat_model = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
     model_type = "offline"
     openai_config = factory.LazyAttribute(
         lambda obj: OpenAIProcessorConversationConfigFactory() if os.getenv("OPENAI_API_KEY") else None
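Beyond swapping the default model string, the one behavioral change in this patch is the memory heuristic in `infer_max_tokens`, which now divides device memory by 1e6 instead of 2e6 and so roughly doubles the inferred context size. Below is a minimal sketch of the new arithmetic, with `get_device_memory` stubbed out; the real helper lives in Khoj's offline utils, and treating its return value as bytes is an assumption made for this example.

```python
import math


def get_device_memory() -> int:
    """Stand-in for Khoj's helper; assumed here to report device memory in bytes."""
    return 8 * 1024**3  # e.g. an 8 GiB GPU


def infer_max_tokens(model_context_window: int, configured_max_tokens=None) -> int:
    """Mirror of the updated heuristic: roughly one token of context per MB of device memory."""
    configured_max_tokens = math.inf if configured_max_tokens is None else configured_max_tokens
    vram_based_n_ctx = int(get_device_memory() / 1e6)  # 8 GiB -> 8589 tokens (was ~4294 with / 2e6)
    return min(configured_max_tokens, vram_based_n_ctx, model_context_window)


# With Llama 3.1 8B's 128K context window and no configured cap, an 8 GiB device
# now gets min(inf, 8589, 131072) == 8589 tokens of context instead of ~4294.
print(infer_max_tokens(model_context_window=131072))
```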