Mirror of https://github.com/khoj-ai/khoj.git
Upgrade to latest GPT4All. Use Mistral as default offline chat model
GPT4All now supports GGUF llama.cpp chat models. The latest GPT4All (with Mistral) responds at least 3x faster: on a MacBook Pro, response start time drops to ~10s from 30s-120s earlier. Mistral is also a better chat model, although it hallucinates more than Llama-2.
Parent: 6dc0df3afb
Commit: 0f1ebcae18
10 changed files with 84 additions and 11 deletions
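For context, the upgrade moves khoj to the gpt4all >= 2.0 Python bindings, which load GGUF llama.cpp models. A minimal sketch of loading the new default model directly with those bindings (illustrative only, not code from this commit; the prompt is made up):

    # Illustrative sketch: load the new default GGUF chat model with gpt4all >= 2.0.
    from gpt4all import GPT4All

    # Downloads the model on first use, then loads it from the local model directory.
    chat_model = GPT4All(model_name="mistral-7b-instruct-v0.1.Q4_0.gguf")
    print(chat_model.generate("Summarize GGUF in one sentence.", max_tokens=64))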
@@ -60,8 +60,8 @@ dependencies = [
     "bs4 >= 0.0.1",
     "anyio == 3.7.1",
     "pymupdf >= 1.23.3",
-    "gpt4all == 1.0.12; platform_system == 'Linux' and platform_machine == 'x86_64'",
-    "gpt4all == 1.0.12; platform_system == 'Windows' or platform_system == 'Darwin'",
+    "gpt4all >= 2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'",
+    "gpt4all >= 2.0.0; platform_system == 'Windows' or platform_system == 'Darwin'",
 ]

 dynamic = ["version"]
src/khoj/migrations/migrate_offline_chat_default_model.py (new file, 69 additions)

@@ -0,0 +1,69 @@
+"""
+Current format of khoj.yml
+---
+app:
+    ...
+content-type:
+    ...
+processor:
+  conversation:
+    offline-chat:
+        enable-offline-chat: false
+        chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin
+    ...
+search-type:
+    ...
+
+New format of khoj.yml
+---
+app:
+    ...
+content-type:
+    ...
+processor:
+  conversation:
+    offline-chat:
+        enable-offline-chat: false
+        chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf
+    ...
+search-type:
+    ...
+"""
+import logging
+from packaging import version
+
+from khoj.utils.yaml import load_config_from_file, save_config_to_file
+
+
+logger = logging.getLogger(__name__)
+
+
+def migrate_offline_chat_default_model(args):
+    schema_version = "0.12.4"
+    raw_config = load_config_from_file(args.config_file)
+    previous_version = raw_config.get("version")
+
+    if "processor" not in raw_config:
+        return args
+    if raw_config["processor"] is None:
+        return args
+    if "conversation" not in raw_config["processor"]:
+        return args
+    if "offline-chat" not in raw_config["processor"]["conversation"]:
+        return args
+    if "chat-model" not in raw_config["processor"]["conversation"]["offline-chat"]:
+        return args
+
+    if previous_version is None or version.parse(previous_version) < version.parse("0.12.4"):
+        logger.info(
+            f"Upgrading config schema to {schema_version} from {previous_version} to change default (offline) chat model to mistral GGUF"
+        )
+        raw_config["version"] = schema_version
+
+        # Update offline chat model to mistral in GGUF format to use latest GPT4All
+        offline_chat_model = raw_config["processor"]["conversation"]["offline-chat"]["chat-model"]
+        if offline_chat_model.endswith(".bin"):
+            raw_config["processor"]["conversation"]["offline-chat"]["chat-model"] = "mistral-7b-instruct-v0.1.Q4_0.gguf"
+
+    save_config_to_file(raw_config, args.config_file)
+    return args
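A small sketch of how this migration could be exercised on its own, assuming an argparse-style args object whose config_file points at a khoj.yml (hypothetical harness, not part of the commit):

    # Hypothetical harness: khoj normally passes its own parsed CLI args here.
    from argparse import Namespace
    from pathlib import Path

    from khoj.migrations.migrate_offline_chat_default_model import migrate_offline_chat_default_model

    args = Namespace(config_file=Path("~/.khoj/khoj.yml").expanduser())
    migrate_offline_chat_default_model(args)
    # Any offline chat-model still ending in ".bin" is rewritten to the mistral GGUF
    # default and the config's schema version is bumped to 0.12.4.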
@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)

 def extract_questions_offline(
     text: str,
-    model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
+    model: str = "mistral-7b-instruct-v0.1.Q4_0.gguf",
     loaded_model: Union[Any, None] = None,
     conversation_log={},
     use_history: bool = True,
@@ -123,7 +123,7 @@ def converse_offline(
     references,
     user_query,
     conversation_log={},
-    model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
+    model: str = "mistral-7b-instruct-v0.1.Q4_0.gguf",
     loaded_model: Union[Any, None] = None,
     completion_func=None,
     conversation_command=ConversationCommand.Default,
@@ -14,9 +14,9 @@ def download_model(model_name: str):
     # Use GPU for Chat Model, if available
     try:
         model = GPT4All(model_name=model_name, device="gpu")
-        logger.debug("Loaded chat model to GPU.")
+        logger.debug(f"Loaded {model_name} chat model to GPU.")
     except ValueError:
         model = GPT4All(model_name=model_name)
-        logger.debug("Loaded chat model to CPU.")
+        logger.debug(f"Loaded {model_name} chat model to CPU.")

     return model
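The helper keeps GPU-first loading; the except ValueError branch above is what falls back to CPU when GPU initialization fails. A rough usage sketch (illustrative only):

    # Illustrative usage of the helper changed above.
    from khoj.processor.conversation.gpt4all.utils import download_model

    # Downloads the model if needed, loads it on GPU when available, else CPU.
    offline_chat_model = download_model("mistral-7b-instruct-v0.1.Q4_0.gguf")
    print(offline_chat_model.generate("Hello!", max_tokens=32))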
@@ -20,9 +20,11 @@ model_to_prompt_size = {
     "gpt-4": 8192,
     "llama-2-7b-chat.ggmlv3.q4_0.bin": 1548,
     "gpt-3.5-turbo-16k": 15000,
+    "mistral-7b-instruct-v0.1.Q4_0.gguf": 1548,
 }
 model_to_tokenizer = {
     "llama-2-7b-chat.ggmlv3.q4_0.bin": "hf-internal-testing/llama-tokenizer",
+    "mistral-7b-instruct-v0.1.Q4_0.gguf": "mistralai/Mistral-7B-Instruct-v0.1",
 }
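These maps act as lookup tables when trimming chat context: the prompt-size entry caps how many tokens the model is sent, and the tokenizer entry names a Hugging Face tokenizer to count them with. A rough sketch of that idea (not the code in this commit; assumes the transformers package is installed):

    # Rough sketch: check a prompt against the mistral model's token budget.
    from transformers import AutoTokenizer

    max_prompt_size = 1548  # model_to_prompt_size["mistral-7b-instruct-v0.1.Q4_0.gguf"]
    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

    prompt = "system prompt + retrieved notes + chat history ..."
    if len(tokenizer.encode(prompt)) > max_prompt_size:
        # Over budget: drop the oldest chat turns until the prompt fits.
        pass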
@@ -10,6 +10,7 @@ from khoj.migrations.migrate_version import migrate_config_to_version
 from khoj.migrations.migrate_processor_config_openai import migrate_processor_conversation_schema
 from khoj.migrations.migrate_offline_model import migrate_offline_model
 from khoj.migrations.migrate_offline_chat_schema import migrate_offline_chat_schema
+from khoj.migrations.migrate_offline_chat_default_model import migrate_offline_chat_default_model


 def cli(args=None):

@@ -61,6 +62,7 @@ def run_migrations(args):
         migrate_processor_conversation_schema,
         migrate_offline_model,
         migrate_offline_chat_schema,
+        migrate_offline_chat_default_model,
     ]
     for migration in migrations:
         args = migration(args)
@@ -55,7 +55,7 @@ empty_config = {
        },
        "offline-chat": {
            "enable-offline-chat": False,
-           "chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
+           "chat-model": "mistral-7b-instruct-v0.1.Q4_0.gguf",
        },
        "tokenizer": None,
        "max-prompt-size": None,
@@ -132,7 +132,7 @@ default_config = {
        },
        "offline-chat": {
            "enable-offline-chat": False,
-           "chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
+           "chat-model": "mistral-7b-instruct-v0.1.Q4_0.gguf",
        },
        "tokenizer": None,
        "max-prompt-size": None,
@@ -93,7 +93,7 @@ class OpenAIProcessorConfig(ConfigBase):

 class OfflineChatProcessorConfig(ConfigBase):
     enable_offline_chat: Optional[bool] = False
-    chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin"
+    chat_model: Optional[str] = "mistral-7b-instruct-v0.1.Q4_0.gguf"


 class ConversationProcessorConfig(ConfigBase):
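With this change, constructing the offline chat config without an explicit chat_model yields the mistral GGUF default. A tiny sketch (the import path is assumed, since the file name is not shown in this hunk):

    from khoj.utils.rawconfig import OfflineChatProcessorConfig  # assumed module path

    config = OfflineChatProcessorConfig(enable_offline_chat=True)
    print(config.chat_model)  # mistral-7b-instruct-v0.1.Q4_0.gguf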
@@ -206,7 +206,7 @@ def processor_config_offline_chat(tmp_path_factory):

     # Setup conversation processor
     processor_config = ProcessorConfig()
-    offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True)
+    offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True, chat_model="mistral-7b-instruct-v0.1.Q4_0.gguf")
     processor_config.conversation = ConversationProcessorConfig(
         offline_chat=offline_chat,
         conversation_logfile=processor_dir.joinpath("conversation_logs.json"),
@@ -24,7 +24,7 @@ from khoj.processor.conversation.gpt4all.utils import download_model
 from khoj.processor.conversation.utils import message_to_log

-MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
+MODEL_NAME = "mistral-7b-instruct-v0.1.Q4_0.gguf"


 @pytest.fixture(scope="session")