Upgrade to latest GPT4All. Use Mistral as default offline chat model

GPT4All now supports GGUF llama.cpp chat models. The latest
GPT4All (+ Mistral) performs at least 3x faster.

On a MacBook Pro, responses now start in ~10s vs 30s-120s earlier.
Mistral is also a better chat model, although it hallucinates more
than llama-2.
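For context, a minimal sketch of what the upgraded dependency enables with gpt4all >= 2.0.0 (the prompt and token limit below are illustrative; the model file is downloaded on first use):

from gpt4all import GPT4All

# gpt4all >= 2.0.0 runs GGUF llama.cpp models; this is the new default offline chat model
model = GPT4All(model_name="mistral-7b-instruct-v0.1.Q4_0.gguf")
with model.chat_session():
    print(model.generate("What is Khoj?", max_tokens=256))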
Debanjum Singh Solanky 2023-10-22 18:16:02 -07:00
parent 6dc0df3afb
commit 0f1ebcae18
10 changed files with 84 additions and 11 deletions

View file

@@ -60,8 +60,8 @@ dependencies = [
    "bs4 >= 0.0.1",
    "anyio == 3.7.1",
    "pymupdf >= 1.23.3",
    "gpt4all == 1.0.12; platform_system == 'Linux' and platform_machine == 'x86_64'",
    "gpt4all == 1.0.12; platform_system == 'Windows' or platform_system == 'Darwin'",
    "gpt4all >= 2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'",
    "gpt4all >= 2.0.0; platform_system == 'Windows' or platform_system == 'Darwin'",
]
dynamic = ["version"]

View file

@@ -0,0 +1,69 @@
"""
Current format of khoj.yml
---
app:
    ...
content-type:
    ...
processor:
  conversation:
    offline-chat:
      enable-offline-chat: false
      chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin
  ...
search-type:
    ...

New format of khoj.yml
---
app:
    ...
content-type:
    ...
processor:
  conversation:
    offline-chat:
      enable-offline-chat: false
      chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf
  ...
search-type:
    ...
"""
import logging

from packaging import version

from khoj.utils.yaml import load_config_from_file, save_config_to_file

logger = logging.getLogger(__name__)


def migrate_offline_chat_default_model(args):
    schema_version = "0.12.4"
    raw_config = load_config_from_file(args.config_file)
    previous_version = raw_config.get("version")

    if "processor" not in raw_config:
        return args
    if raw_config["processor"] is None:
        return args
    if "conversation" not in raw_config["processor"]:
        return args
    if "offline-chat" not in raw_config["processor"]["conversation"]:
        return args
    if "chat-model" not in raw_config["processor"]["conversation"]["offline-chat"]:
        return args

    if previous_version is None or version.parse(previous_version) < version.parse("0.12.4"):
        logger.info(
            f"Upgrading config schema to {schema_version} from {previous_version} to change default (offline) chat model to mistral GGUF"
        )
        raw_config["version"] = schema_version

        # Update offline chat model to mistral in GGUF format to use latest GPT4All
        offline_chat_model = raw_config["processor"]["conversation"]["offline-chat"]["chat-model"]
        if offline_chat_model.endswith(".bin"):
            raw_config["processor"]["conversation"]["offline-chat"]["chat-model"] = "mistral-7b-instruct-v0.1.Q4_0.gguf"

        save_config_to_file(raw_config, args.config_file)

    return args
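A rough sketch of how this migration gets exercised (the args namespace and config path are illustrative stand-ins for the parsed CLI args; in practice run_migrations in cli.py passes them in):

from types import SimpleNamespace

from khoj.migrations.migrate_offline_chat_default_model import migrate_offline_chat_default_model

# Illustrative stand-in for parsed CLI args; only config_file is read by the migration
args = SimpleNamespace(config_file="tests/data/config.yml")
args = migrate_offline_chat_default_model(args)
# Any offline chat-model still pointing at a .bin file is rewritten to the mistral GGUF default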

View file

@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
def extract_questions_offline(
    text: str,
    model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
    model: str = "mistral-7b-instruct-v0.1.Q4_0.gguf",
    loaded_model: Union[Any, None] = None,
    conversation_log={},
    use_history: bool = True,
@@ -123,7 +123,7 @@ def converse_offline(
    references,
    user_query,
    conversation_log={},
    model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
    model: str = "mistral-7b-instruct-v0.1.Q4_0.gguf",
    loaded_model: Union[Any, None] = None,
    completion_func=None,
    conversation_command=ConversationCommand.Default,

View file

@@ -14,9 +14,9 @@ def download_model(model_name: str):
    # Use GPU for Chat Model, if available
    try:
        model = GPT4All(model_name=model_name, device="gpu")
        logger.debug("Loaded chat model to GPU.")
        logger.debug(f"Loaded {model_name} chat model to GPU.")
    except ValueError:
        model = GPT4All(model_name=model_name)
        logger.debug("Loaded chat model to CPU.")
        logger.debug(f"Loaded {model_name} chat model to CPU.")

    return model
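Usage stays the same as before, just with a GGUF model name. A sketch (prompt and token limit are illustrative):

from khoj.processor.conversation.gpt4all.utils import download_model

# Tries the GPU first, falls back to CPU if GPT4All raises ValueError
model = download_model("mistral-7b-instruct-v0.1.Q4_0.gguf")
print(model.generate("Say hello in one sentence.", max_tokens=64))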

View file

@@ -20,9 +20,11 @@ model_to_prompt_size = {
    "gpt-4": 8192,
    "llama-2-7b-chat.ggmlv3.q4_0.bin": 1548,
    "gpt-3.5-turbo-16k": 15000,
    "mistral-7b-instruct-v0.1.Q4_0.gguf": 1548,
}
model_to_tokenizer = {
    "llama-2-7b-chat.ggmlv3.q4_0.bin": "hf-internal-testing/llama-tokenizer",
    "mistral-7b-instruct-v0.1.Q4_0.gguf": "mistralai/Mistral-7B-Instruct-v0.1",
}
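These maps feed prompt truncation for chat models. A hedged sketch of how the new entries could be used (the AutoTokenizer call is illustrative, not the exact khoj truncation code):

from transformers import AutoTokenizer

chat_model = "mistral-7b-instruct-v0.1.Q4_0.gguf"
max_prompt_size = model_to_prompt_size[chat_model]  # 1548 tokens
tokenizer = AutoTokenizer.from_pretrained(model_to_tokenizer[chat_model])

prompt = "..."  # rendered chat history + current query
if len(tokenizer.encode(prompt)) > max_prompt_size:
    # drop older chat history until the prompt fits the model's context budget
    pass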

View file

@@ -10,6 +10,7 @@ from khoj.migrations.migrate_version import migrate_config_to_version
from khoj.migrations.migrate_processor_config_openai import migrate_processor_conversation_schema
from khoj.migrations.migrate_offline_model import migrate_offline_model
from khoj.migrations.migrate_offline_chat_schema import migrate_offline_chat_schema
from khoj.migrations.migrate_offline_chat_default_model import migrate_offline_chat_default_model


def cli(args=None):
@@ -61,6 +62,7 @@ def run_migrations(args):
        migrate_processor_conversation_schema,
        migrate_offline_model,
        migrate_offline_chat_schema,
        migrate_offline_chat_default_model,
    ]
    for migration in migrations:
        args = migration(args)

View file

@@ -55,7 +55,7 @@ empty_config = {
            },
            "offline-chat": {
                "enable-offline-chat": False,
                "chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
                "chat-model": "mistral-7b-instruct-v0.1.Q4_0.gguf",
            },
            "tokenizer": None,
            "max-prompt-size": None,
@@ -132,7 +132,7 @@ default_config = {
            },
            "offline-chat": {
                "enable-offline-chat": False,
                "chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
                "chat-model": "mistral-7b-instruct-v0.1.Q4_0.gguf",
            },
            "tokenizer": None,
            "max-prompt-size": None,

View file

@@ -93,7 +93,7 @@ class OpenAIProcessorConfig(ConfigBase):


class OfflineChatProcessorConfig(ConfigBase):
    enable_offline_chat: Optional[bool] = False
    chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin"
    chat_model: Optional[str] = "mistral-7b-instruct-v0.1.Q4_0.gguf"


class ConversationProcessorConfig(ConfigBase):
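With this default changed, constructing the config without arguments now points at the GGUF model. A quick sketch (import path assumed to be khoj.utils.rawconfig, where these ConfigBase models live):

from khoj.utils.rawconfig import OfflineChatProcessorConfig

config = OfflineChatProcessorConfig()
assert config.enable_offline_chat is False
assert config.chat_model == "mistral-7b-instruct-v0.1.Q4_0.gguf"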

View file

@@ -206,7 +206,7 @@ def processor_config_offline_chat(tmp_path_factory):
    # Setup conversation processor
    processor_config = ProcessorConfig()
    offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True)
    offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True, chat_model="mistral-7b-instruct-v0.1.Q4_0.gguf")
    processor_config.conversation = ConversationProcessorConfig(
        offline_chat=offline_chat,
        conversation_logfile=processor_dir.joinpath("conversation_logs.json"),

View file

@@ -24,7 +24,7 @@ from khoj.processor.conversation.gpt4all.utils import download_model
from khoj.processor.conversation.utils import message_to_log

MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
MODEL_NAME = "mistral-7b-instruct-v0.1.Q4_0.gguf"


@pytest.fixture(scope="session")