Merge pull request #892 from khoj-ai/upgrade-offline-chat-models-support

Upgrade offline chat model support. Default to Llama 3.1
sabaimran 2024-08-20 11:51:20 -05:00 committed by GitHub
commit 4808ce778a
16 changed files with 42 additions and 15 deletions

View file

@@ -25,7 +25,7 @@ Offline chat stays completely private and can work without internet using open-s
 > - An Nvidia, AMD GPU or a Mac M1+ machine would significantly speed up chat response times
 1. Open your [Khoj offline settings](http://localhost:42110/server/admin/database/offlinechatprocessorconversationconfig/) and click *Enable* on the Offline Chat configuration.
-2. Open your [Chat model options settings](http://localhost:42110/server/admin/database/chatmodeloptions/) and add any [GGUF chat model](https://huggingface.co/models?library=gguf) to use for offline chat. Make sure to use `Offline` as its type. For a balanced chat model that runs well on standard consumer hardware we recommend using [Hermes-2-Pro-Mistral-7B by NousResearch](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) by default.
+2. Open your [Chat model options settings](http://localhost:42110/server/admin/database/chatmodeloptions/) and add any [GGUF chat model](https://huggingface.co/models?library=gguf) to use for offline chat. Make sure to use `Offline` as its type. For a balanced chat model that runs well on standard consumer hardware we recommend using [Llama 3.1 by Meta](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) by default.
 :::tip[Note]

View file

@@ -222,7 +222,7 @@ Using Ollama? See the [Ollama Integration](/advanced/ollama) section for more cu
 Any chat model on Huggingface in GGUF format can be used for local chat. Here's how you can set it up:
 1. No need to setup a conversation processor config!
-2. Go over to configure your [chat model options](http://localhost:42110/server/admin/database/chatmodeloptions/). Set the `chat-model` field to a supported chat model[^1] of your choice. For example, we recommend `NousResearch/Hermes-2-Pro-Mistral-7B-GGUF`, but [any gguf model on huggingface](https://huggingface.co/models?library=gguf) should work.
+2. Go over to configure your [chat model options](http://localhost:42110/server/admin/database/chatmodeloptions/). Set the `chat-model` field to a supported chat model[^1] of your choice. For example, we recommend `bartowski/Meta-Llama-3.1-8B-Instruct-GGUF`, but [any gguf model on huggingface](https://huggingface.co/models?library=gguf) should work.
    - Make sure to set the `model-type` to `Offline`. Do not set `openai config`.
    - The `tokenizer` and `max-prompt-size` fields are optional. You can set these for non-standard models (i.e not Mistral or Llama based models) or when you know the token limit of the model to improve context stuffing.

View file

@@ -66,7 +66,7 @@ dependencies = [
     "pymupdf >= 1.23.5",
     "django == 5.0.7",
     "authlib == 1.2.1",
-    "llama-cpp-python == 0.2.82",
+    "llama-cpp-python == 0.2.88",
     "itsdangerous == 2.1.2",
     "httpx == 0.25.0",
     "pgvector == 0.2.4",

View file

@@ -262,7 +262,7 @@ export function TrainOfThought(props: TrainOfThoughtProps) {
     let markdownRendered = DOMPurify.sanitize(md.render(props.message));
     return (
         <div
-            className={`${styles.trainOfThoughtElement} items-center ${props.primary ? "text-gray-400" : "text-gray-300"} ${styles.trainOfThought} ${props.primary ? styles.primary : ""}`}
+            className={`${styles.trainOfThoughtElement} break-all items-center ${props.primary ? "text-gray-400" : "text-gray-300"} ${styles.trainOfThought} ${props.primary ? styles.primary : ""}`}
         >
             {icon}
             <div dangerouslySetInnerHTML={{ __html: markdownRendered }} />

View file

@@ -0,0 +1,17 @@
+# Generated by Django 5.0.7 on 2024-08-19 12:37
+from django.db import migrations, models
+class Migration(migrations.Migration):
+    dependencies = [
+        ("database", "0057_remove_serverchatsettings_default_model_and_more"),
+    ]
+    operations = [
+        migrations.AlterField(
+            model_name="chatmodeloptions",
+            name="chat_model",
+            field=models.CharField(default="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", max_length=200),
+        ),
+    ]

View file

@@ -91,7 +91,7 @@ class ChatModelOptions(BaseModel):
     max_prompt_size = models.IntegerField(default=None, null=True, blank=True)
     subscribed_max_prompt_size = models.IntegerField(default=None, null=True, blank=True)
     tokenizer = models.CharField(max_length=200, default=None, null=True, blank=True)
-    chat_model = models.CharField(max_length=200, default="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF")
+    chat_model = models.CharField(max_length=200, default="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF")
     model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OFFLINE)
     openai_config = models.ForeignKey(
         OpenAIProcessorConversationConfig, on_delete=models.CASCADE, default=None, null=True, blank=True
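
For context, a hedged Django-shell sketch of what a record using the new default looks like; the `khoj.database.models` import path is an assumption inferred from the `database` app label in the migration above, and the field values mirror the defaults and test factories in this diff:

```python
# Hypothetical snippet for `python manage.py shell`; the import path is assumed,
# everything else is taken from the fields shown in the model change above.
from khoj.database.models import ChatModelOptions

ChatModelOptions.objects.create(
    chat_model="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",  # new default from this commit
    model_type="offline",   # same value the test factories in this diff use
    max_prompt_size=None,   # optional cap on context stuffing
    tokenizer=None,         # optional; only needed for non-standard models
)
```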

View file

@@ -24,7 +24,7 @@ logger = logging.getLogger(__name__)
 def extract_questions_offline(
     text: str,
-    model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
+    model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
     loaded_model: Union[Any, None] = None,
     conversation_log={},
     use_history: bool = True,
@@ -103,6 +103,9 @@ def extract_questions_offline(
             .replace("']", '"]')
             .replace("', '", '", "')
         )
+        # Remove any markdown json codeblock formatting if present (useful for gemma-2)
+        if response.startswith("```json"):
+            response = response[7:-3]
         questions: List[str] = json.loads(questions_str)
         questions = filter_questions(questions)
     except:
@@ -138,7 +141,7 @@ def converse_offline(
     references=[],
     online_results=[],
     conversation_log={},
-    model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
+    model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
     loaded_model: Union[Any, None] = None,
     completion_func=None,
     conversation_commands=[ConversationCommand.Default],
@@ -237,7 +240,7 @@ def llm_thread(g, messages: List[ChatMessage], model: Any, max_prompt_size: int
 def send_message_to_model_offline(
     messages: List[ChatMessage],
     loaded_model=None,
-    model="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
+    model="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
     temperature: float = 0.2,
     streaming=False,
     stop=[],
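
The codeblock guard added above (and again in the router changes later in this diff) follows one pattern: strip a fenced ```json wrapper that some models, notably gemma-2, emit before the JSON is parsed. A standalone sketch of that pattern, using a helper name of our own choosing rather than anything defined in Khoj:

```python
import json

def strip_json_codeblock(response: str) -> str:
    """Drop a leading ```json fence (and trailing ```), as some models emit them."""
    response = response.strip()
    if response.startswith("```json") and response.endswith("```"):
        response = response[len("```json"):-len("```")]
    return response.strip()

# Example: a fenced model response still parses cleanly.
raw = '```json\n["What is Khoj?", "How do I enable offline chat?"]\n```'
questions = json.loads(strip_json_codeblock(raw))
assert questions[0] == "What is Khoj?"
```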

View file

@@ -75,6 +75,6 @@ def load_model_from_cache(repo_id: str, filename: str, repo_type="models"):
 def infer_max_tokens(model_context_window: int, configured_max_tokens=None) -> int:
     """Infer max prompt size based on device memory and max context window supported by the model"""
     configured_max_tokens = math.inf if configured_max_tokens is None else configured_max_tokens
-    vram_based_n_ctx = int(get_device_memory() / 2e6) # based on heuristic
+    vram_based_n_ctx = int(get_device_memory() / 1e6) # based on heuristic
     configured_max_tokens = configured_max_tokens or math.inf # do not use if set to None
     return min(configured_max_tokens, vram_based_n_ctx, model_context_window)
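
The divisor change doubles the memory-derived context budget: roughly one context token per MB of device memory instead of one per 2 MB. A hedged, self-contained sketch of the arithmetic, with device memory passed in explicitly (we assume the heuristic treats the reported memory as bytes, which this diff does not state):

```python
import math

def infer_max_tokens(model_context_window: int, configured_max_tokens=None, device_memory_bytes=8e9) -> int:
    # Same heuristic as the function above, reworked for illustration only.
    configured_max_tokens = math.inf if configured_max_tokens is None else configured_max_tokens
    vram_based_n_ctx = int(device_memory_bytes / 1e6)  # ~1 token of context per MB of memory
    return min(configured_max_tokens, vram_based_n_ctx, model_context_window)

# 8 GB of device memory now yields an 8000-token budget (previously ~4000 with /2e6),
# still capped by the model's own context window and any configured maximum.
print(infer_max_tokens(model_context_window=128_000))  # -> 8000
```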

View file

@@ -587,7 +587,7 @@ You are Khoj, an advanced google search assistant. You are tasked with construct
 - Official, up-to-date information about you, Khoj, is available at site:khoj.dev, github or pypi.
 What Google searches, if any, will you need to perform to answer the user's question?
-Provide search queries as a list of strings in a JSON object. Do not wrap the json in a codeblock.
+Provide search queries as a list of strings in a JSON object.
 Current Date: {current_date}
 User's Location: {location}
 {username}

View file

@@ -25,6 +25,7 @@ model_to_prompt_size = {
     "gpt-4-turbo-preview": 20000,
     "TheBloke/Mistral-7B-Instruct-v0.2-GGUF": 3500,
     "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF": 3500,
+    "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF": 20000,
 }
 model_to_tokenizer: Dict[str, str] = {}
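
As a hedged illustration of this mapping (how Khoj consumes it is not shown in this diff), a lookup with a conservative fallback for models that have no entry:

```python
# Sketch only: resolve a prompt-size cap for a chat model, defaulting to a small budget.
model_to_prompt_size = {
    "gpt-4-turbo-preview": 20000,
    "TheBloke/Mistral-7B-Instruct-v0.2-GGUF": 3500,
    "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF": 3500,
    "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF": 20000,
}

def prompt_size_for(chat_model: str, default: int = 3500) -> int:
    return model_to_prompt_size.get(chat_model, default)

print(prompt_size_for("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"))  # -> 20000
```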

View file

@@ -279,6 +279,9 @@ async def aget_relevant_information_sources(query: str, conversation_history: di
     try:
         response = response.strip()
+        # Remove any markdown json codeblock formatting if present (useful for gemma-2)
+        if response.startswith("```json"):
+            response = response[7:-3]
         response = json.loads(response)
         response = [q.strip() for q in response["source"] if q.strip()]
         if not isinstance(response, list) or not response or len(response) == 0:
@@ -401,6 +404,9 @@ async def generate_online_subqueries(
     # Validate that the response is a non-empty, JSON-serializable list
     try:
         response = response.strip()
+        # Remove any markdown json codeblock formatting if present (useful for gemma-2)
+        if response.startswith("```json") and response.endswith("```"):
+            response = response[7:-3]
         response = json.loads(response)
         response = [q.strip() for q in response["queries"] if q.strip()]
         if not isinstance(response, list) or not response or len(response) == 0:

View file

@@ -70,7 +70,7 @@ class OfflineChatProcessorConfig:
 class OfflineChatProcessorModel:
-    def __init__(self, chat_model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", max_tokens: int = None):
+    def __init__(self, chat_model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", max_tokens: int = None):
         self.chat_model = chat_model
         self.loaded_model = None
         try:
View file

@@ -8,7 +8,7 @@ empty_escape_sequences = "\n|\r|\t| "
 app_env_filepath = "~/.khoj/env"
 telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry"
 content_directory = "~/.khoj/content/"
-default_offline_chat_model = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF"
+default_offline_chat_model = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
 default_online_chat_model = "gpt-4-turbo-preview"
 empty_config = {

View file

@@ -93,7 +93,7 @@ class OpenAIProcessorConfig(ConfigBase):
 class OfflineChatProcessorConfig(ConfigBase):
-    chat_model: Optional[str] = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF"
+    chat_model: Optional[str] = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
 class ConversationProcessorConfig(ConfigBase):

View file

@@ -378,7 +378,7 @@ def client_offline_chat(search_config: SearchConfig, default_user2: KhojUser):
     # Initialize Processor from Config
     ChatModelOptionsFactory(
-        chat_model="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
+        chat_model="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
         tokenizer=None,
         max_prompt_size=None,
         model_type="offline",

View file

@@ -49,7 +49,7 @@ class ChatModelOptionsFactory(factory.django.DjangoModelFactory):
     max_prompt_size = 3500
     tokenizer = None
-    chat_model = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF"
+    chat_model = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
     model_type = "offline"
     openai_config = factory.LazyAttribute(
         lambda obj: OpenAIProcessorConversationConfigFactory() if os.getenv("OPENAI_API_KEY") else None