Mirror of https://github.com/khoj-ai/khoj.git
Merge pull request #892 from khoj-ai/upgrade-offline-chat-models-support

Upgrade offline chat model support. Default to Llama 3.1

Commit 4808ce778a: 16 changed files with 42 additions and 15 deletions
@@ -25,7 +25,7 @@ Offline chat stays completely private and can work without internet using open-s
 > - An Nvidia, AMD GPU or a Mac M1+ machine would significantly speed up chat response times

 1. Open your [Khoj offline settings](http://localhost:42110/server/admin/database/offlinechatprocessorconversationconfig/) and click *Enable* on the Offline Chat configuration.
-2. Open your [Chat model options settings](http://localhost:42110/server/admin/database/chatmodeloptions/) and add any [GGUF chat model](https://huggingface.co/models?library=gguf) to use for offline chat. Make sure to use `Offline` as its type. For a balanced chat model that runs well on standard consumer hardware we recommend using [Hermes-2-Pro-Mistral-7B by NousResearch](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) by default.
+2. Open your [Chat model options settings](http://localhost:42110/server/admin/database/chatmodeloptions/) and add any [GGUF chat model](https://huggingface.co/models?library=gguf) to use for offline chat. Make sure to use `Offline` as its type. For a balanced chat model that runs well on standard consumer hardware we recommend using [Llama 3.1 by Meta](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) by default.

 :::tip[Note]
@@ -222,7 +222,7 @@ Using Ollama? See the [Ollama Integration](/advanced/ollama) section for more cu
 Any chat model on Huggingface in GGUF format can be used for local chat. Here's how you can set it up:

 1. No need to setup a conversation processor config!
-2. Go over to configure your [chat model options](http://localhost:42110/server/admin/database/chatmodeloptions/). Set the `chat-model` field to a supported chat model[^1] of your choice. For example, we recommend `NousResearch/Hermes-2-Pro-Mistral-7B-GGUF`, but [any gguf model on huggingface](https://huggingface.co/models?library=gguf) should work.
+2. Go over to configure your [chat model options](http://localhost:42110/server/admin/database/chatmodeloptions/). Set the `chat-model` field to a supported chat model[^1] of your choice. For example, we recommend `bartowski/Meta-Llama-3.1-8B-Instruct-GGUF`, but [any gguf model on huggingface](https://huggingface.co/models?library=gguf) should work.
    - Make sure to set the `model-type` to `Offline`. Do not set `openai config`.
    - The `tokenizer` and `max-prompt-size` fields are optional. You can set these for non-standard models (i.e not Mistral or Llama based models) or when you know the token limit of the model to improve context stuffing.
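For readers who want the end state rather than the click path, the documented steps above boil down to creating one ChatModelOptions record. A minimal sketch is below; it assumes a Django shell (`python manage.py shell`) and the `khoj.database.models` import path, neither of which is shown in this diff, so treat it as an illustration rather than a documented workflow.

```python
# Hypothetical sketch, not part of this PR: create the recommended offline chat model
# entry from a Django shell instead of the admin UI. Field names and values mirror
# the ChatModelOptions model and factory changed later in this diff.
from khoj.database.models import ChatModelOptions  # assumed import path

ChatModelOptions.objects.create(
    chat_model="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",  # any GGUF model on Hugging Face
    model_type="offline",  # offline model type; leave openai_config unset
    tokenizer=None,        # optional, for non-standard (non-Mistral/Llama) models
    max_prompt_size=None,  # optional, set when you know the model's token limit
)
```

Either way, the result is the same chat model options record that the admin pages linked above edit.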
@@ -66,7 +66,7 @@ dependencies = [
     "pymupdf >= 1.23.5",
     "django == 5.0.7",
     "authlib == 1.2.1",
-    "llama-cpp-python == 0.2.82",
+    "llama-cpp-python == 0.2.88",
     "itsdangerous == 2.1.2",
     "httpx == 0.25.0",
     "pgvector == 0.2.4",
@@ -262,7 +262,7 @@ export function TrainOfThought(props: TrainOfThoughtProps) {
     let markdownRendered = DOMPurify.sanitize(md.render(props.message));
     return (
         <div
-            className={`${styles.trainOfThoughtElement} items-center ${props.primary ? "text-gray-400" : "text-gray-300"} ${styles.trainOfThought} ${props.primary ? styles.primary : ""}`}
+            className={`${styles.trainOfThoughtElement} break-all items-center ${props.primary ? "text-gray-400" : "text-gray-300"} ${styles.trainOfThought} ${props.primary ? styles.primary : ""}`}
         >
             {icon}
             <div dangerouslySetInnerHTML={{ __html: markdownRendered }} />
@@ -0,0 +1,17 @@
+# Generated by Django 5.0.7 on 2024-08-19 12:37
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("database", "0057_remove_serverchatsettings_default_model_and_more"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="chatmodeloptions",
+            name="chat_model",
+            field=models.CharField(default="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", max_length=200),
+        ),
+    ]
@@ -91,7 +91,7 @@ class ChatModelOptions(BaseModel):
     max_prompt_size = models.IntegerField(default=None, null=True, blank=True)
     subscribed_max_prompt_size = models.IntegerField(default=None, null=True, blank=True)
     tokenizer = models.CharField(max_length=200, default=None, null=True, blank=True)
-    chat_model = models.CharField(max_length=200, default="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF")
+    chat_model = models.CharField(max_length=200, default="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF")
     model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OFFLINE)
     openai_config = models.ForeignKey(
         OpenAIProcessorConversationConfig, on_delete=models.CASCADE, default=None, null=True, blank=True
@@ -24,7 +24,7 @@ logger = logging.getLogger(__name__)

 def extract_questions_offline(
     text: str,
-    model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
+    model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
     loaded_model: Union[Any, None] = None,
     conversation_log={},
     use_history: bool = True,
@@ -103,6 +103,9 @@ def extract_questions_offline(
             .replace("']", '"]')
             .replace("', '", '", "')
         )
+        # Remove any markdown json codeblock formatting if present (useful for gemma-2)
+        if response.startswith("```json"):
+            response = response[7:-3]
         questions: List[str] = json.loads(questions_str)
         questions = filter_questions(questions)
     except:
@@ -138,7 +141,7 @@ def converse_offline(
     references=[],
     online_results=[],
     conversation_log={},
-    model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
+    model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
     loaded_model: Union[Any, None] = None,
     completion_func=None,
     conversation_commands=[ConversationCommand.Default],
@@ -237,7 +240,7 @@ def llm_thread(g, messages: List[ChatMessage], model: Any, max_prompt_size: int
 def send_message_to_model_offline(
     messages: List[ChatMessage],
     loaded_model=None,
-    model="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
+    model="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
     temperature: float = 0.2,
     streaming=False,
     stop=[],
@@ -75,6 +75,6 @@ def load_model_from_cache(repo_id: str, filename: str, repo_type="models"):
 def infer_max_tokens(model_context_window: int, configured_max_tokens=None) -> int:
     """Infer max prompt size based on device memory and max context window supported by the model"""
     configured_max_tokens = math.inf if configured_max_tokens is None else configured_max_tokens
-    vram_based_n_ctx = int(get_device_memory() / 2e6)  # based on heuristic
+    vram_based_n_ctx = int(get_device_memory() / 1e6)  # based on heuristic
     configured_max_tokens = configured_max_tokens or math.inf  # do not use if set to None
     return min(configured_max_tokens, vram_based_n_ctx, model_context_window)
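To make the heuristic change above concrete, here is a small self-contained sketch of the new calculation. The 8 GB device memory figure and the 128K-token context window are assumed example values, not numbers from the diff.

```python
import math


def infer_max_tokens_sketch(device_memory_bytes: int, model_context_window: int, configured_max_tokens=None) -> int:
    # Same logic as the hunk above, with device memory passed in explicitly for illustration.
    configured_max_tokens = math.inf if configured_max_tokens is None else configured_max_tokens
    vram_based_n_ctx = int(device_memory_bytes / 1e6)  # new divisor; was 2e6 before this change
    return min(configured_max_tokens, vram_based_n_ctx, model_context_window)


# Assumed values: 8 GB of device memory, 128K-token context window.
print(infer_max_tokens_sketch(8 * 10**9, 131_072))  # 8000; the old 2e6 divisor gave 4000
```

In short, the new divisor lets the same hardware use roughly twice the context, still capped by the model's context window and any explicitly configured limit.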
@@ -587,7 +587,7 @@ You are Khoj, an advanced google search assistant. You are tasked with construct
 - Official, up-to-date information about you, Khoj, is available at site:khoj.dev, github or pypi.

 What Google searches, if any, will you need to perform to answer the user's question?
-Provide search queries as a list of strings in a JSON object. Do not wrap the json in a codeblock.
+Provide search queries as a list of strings in a JSON object.
 Current Date: {current_date}
 User's Location: {location}
 {username}
@@ -25,6 +25,7 @@ model_to_prompt_size = {
     "gpt-4-turbo-preview": 20000,
     "TheBloke/Mistral-7B-Instruct-v0.2-GGUF": 3500,
     "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF": 3500,
+    "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF": 20000,
 }
 model_to_tokenizer: Dict[str, str] = {}
@@ -279,6 +279,9 @@ async def aget_relevant_information_sources(query: str, conversation_history: di

     try:
         response = response.strip()
+        # Remove any markdown json codeblock formatting if present (useful for gemma-2)
+        if response.startswith("```json"):
+            response = response[7:-3]
         response = json.loads(response)
         response = [q.strip() for q in response["source"] if q.strip()]
         if not isinstance(response, list) or not response or len(response) == 0:
@@ -401,6 +404,9 @@ async def generate_online_subqueries(
     # Validate that the response is a non-empty, JSON-serializable list
     try:
         response = response.strip()
+        # Remove any markdown json codeblock formatting if present (useful for gemma-2)
+        if response.startswith("```json") and response.endswith("```"):
+            response = response[7:-3]
         response = json.loads(response)
         response = [q.strip() for q in response["queries"] if q.strip()]
         if not isinstance(response, list) or not response or len(response) == 0:
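The same codeblock-stripping pattern appears three times in this diff (extract_questions_offline, aget_relevant_information_sources, generate_online_subqueries). The slice works because the opening marker (three backticks plus the word json) is 7 characters and the closing fence is 3, hence the `[7:-3]`. A minimal standalone demonstration with an assumed model response:

```python
import json

# Assumed model output wrapped in a markdown json codeblock, as gemma-2 tends to produce.
response = '```json\n["what is llama 3.1?", "llama 3.1 release date"]\n```'

# Same stripping logic as the hunks above: drop the 7-character opening marker
# and the 3-character closing fence, then parse the remaining JSON.
if response.startswith("```json") and response.endswith("```"):
    response = response[7:-3]

queries = [q.strip() for q in json.loads(response) if q.strip()]
print(queries)  # ['what is llama 3.1?', 'llama 3.1 release date']
```

Note that the first two occurrences only check `startswith`, while `generate_online_subqueries` also checks `endswith`; the slice itself is identical in all three.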
@@ -70,7 +70,7 @@ class OfflineChatProcessorConfig:


 class OfflineChatProcessorModel:
-    def __init__(self, chat_model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", max_tokens: int = None):
+    def __init__(self, chat_model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", max_tokens: int = None):
         self.chat_model = chat_model
         self.loaded_model = None
         try:
@@ -8,7 +8,7 @@ empty_escape_sequences = "\n|\r|\t| "
 app_env_filepath = "~/.khoj/env"
 telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry"
 content_directory = "~/.khoj/content/"
-default_offline_chat_model = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF"
+default_offline_chat_model = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
 default_online_chat_model = "gpt-4-turbo-preview"

 empty_config = {
@@ -93,7 +93,7 @@ class OpenAIProcessorConfig(ConfigBase):


 class OfflineChatProcessorConfig(ConfigBase):
-    chat_model: Optional[str] = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF"
+    chat_model: Optional[str] = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"


 class ConversationProcessorConfig(ConfigBase):
@@ -378,7 +378,7 @@ def client_offline_chat(search_config: SearchConfig, default_user2: KhojUser):

     # Initialize Processor from Config
     ChatModelOptionsFactory(
-        chat_model="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
+        chat_model="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
         tokenizer=None,
         max_prompt_size=None,
         model_type="offline",
@@ -49,7 +49,7 @@ class ChatModelOptionsFactory(factory.django.DjangoModelFactory):

     max_prompt_size = 3500
     tokenizer = None
-    chat_model = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF"
+    chat_model = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
     model_type = "offline"
     openai_config = factory.LazyAttribute(
         lambda obj: OpenAIProcessorConversationConfigFactory() if os.getenv("OPENAI_API_KEY") else None