Merge branch 'master' of github.com:khoj-ai/khoj into features/multi-user-support-khoj

Merge changes to use the latest GPT4All with GPU and GGUF model support
into the Khoj multi-user support rearchitecture branch
Debanjum Singh Solanky 2023-11-02 22:44:25 -07:00
commit 345856e7be
16 changed files with 107 additions and 18 deletions

View file

@@ -10,7 +10,8 @@
Offline chat stays completely private and works without internet. But it is slower, lower quality, and more compute intensive.
> **System Requirements**:
> - Machine with at least **6 GB of RAM** and **4 GB of Disk** available
> - Minimum **8 GB of RAM**. Recommend **16 GB of VRAM**
> - Minimum **5 GB of Disk** available
> - A CPU supporting [AVX or AVX2 instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) is required
> - A Mac M1+ or [Vulkan supported GPU](https://vulkan.gpuinfo.org/) should significantly speed up chat response times

View file

@@ -62,8 +62,8 @@ dependencies = [
"pymupdf >= 1.23.5",
"django == 4.2.5",
"authlib == 1.2.1",
"gpt4all == 1.0.12; platform_system == 'Linux' and platform_machine == 'x86_64'",
"gpt4all == 1.0.12; platform_system == 'Windows' or platform_system == 'Darwin'",
"gpt4all >= 2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'",
"gpt4all >= 2.0.0; platform_system == 'Windows' or platform_system == 'Darwin'",
"itsdangerous == 2.1.2",
"httpx == 0.25.0",
"pgvector == 0.2.3",

View file

@@ -38,7 +38,7 @@ export class KhojChatModal extends Modal {
        await this.getChatHistory();

        // Add chat input field
        contentEl.createEl("input",
        const chatInput = contentEl.createEl("input",
            {
                attr: {
                    type: "text",
@@ -48,10 +48,11 @@ export class KhojChatModal extends Modal {
                    class: "khoj-chat-input option"
                }
            })
            .addEventListener('change', (event) => { this.result = (<HTMLInputElement>event.target).value });
        chatInput.addEventListener('change', (event) => { this.result = (<HTMLInputElement>event.target).value });

        // Scroll to bottom of modal, till the send message input box
        this.modalEl.scrollTop = this.modalEl.scrollHeight;

        chatInput.focus();
    }

    generateReference(messageEl: any, reference: string, index: number) {

View file

@@ -122,6 +122,7 @@ def set_state(args):
    state.demo = args.demo
    state.anonymous_mode = args.anonymous_mode
    state.khoj_version = version("khoj-assistant")
    state.chat_on_gpu = args.chat_on_gpu


def start_server(app, host=None, port=None, socket=None):

View file

@@ -0,0 +1,69 @@
"""
Current format of khoj.yml
---
app:
...
content-type:
...
processor:
conversation:
offline-chat:
enable-offline-chat: false
chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin
...
search-type:
...
New format of khoj.yml
---
app:
...
content-type:
...
processor:
conversation:
offline-chat:
enable-offline-chat: false
chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf
...
search-type:
...
"""
import logging

from packaging import version

from khoj.utils.yaml import load_config_from_file, save_config_to_file

logger = logging.getLogger(__name__)


def migrate_offline_chat_default_model(args):
    schema_version = "0.12.4"
    raw_config = load_config_from_file(args.config_file)
    previous_version = raw_config.get("version")

    if "processor" not in raw_config:
        return args
    if raw_config["processor"] is None:
        return args
    if "conversation" not in raw_config["processor"]:
        return args
    if "offline-chat" not in raw_config["processor"]["conversation"]:
        return args
    if "chat-model" not in raw_config["processor"]["conversation"]["offline-chat"]:
        return args

    if previous_version is None or version.parse(previous_version) < version.parse("0.12.4"):
        logger.info(
            f"Upgrading config schema to {schema_version} from {previous_version} to change default (offline) chat model to mistral GGUF"
        )
        raw_config["version"] = schema_version

        # Update offline chat model to mistral in GGUF format to use latest GPT4All
        offline_chat_model = raw_config["processor"]["conversation"]["offline-chat"]["chat-model"]
        if offline_chat_model.endswith(".bin"):
            raw_config["processor"]["conversation"]["offline-chat"]["chat-model"] = "mistral-7b-instruct-v0.1.Q4_0.gguf"

        save_config_to_file(raw_config, args.config_file)

    return args
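For context, a minimal sketch of exercising this migration on its own follows; `args` is assumed to be the parsed CLI namespace with a `config_file` attribute, the path below is a placeholder, and in khoj the migration actually runs via the migration chain in cli.py shown further below.

```python
# Hedged sketch: run only this migration against a khoj.yml in the old format.
# The config path is a placeholder; khoj normally passes the parsed CLI args here.
from types import SimpleNamespace

from khoj.migrations.migrate_offline_chat_default_model import migrate_offline_chat_default_model

args = SimpleNamespace(config_file="~/.khoj/khoj.yml")  # placeholder path, assumption
args = migrate_offline_chat_default_model(args)
# Afterwards, an offline chat-model ending in ".bin" has been rewritten to
# "mistral-7b-instruct-v0.1.Q4_0.gguf" and the config file saved back to disk.
```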

View file

@@ -9,7 +9,7 @@ processor:
conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json
max-prompt-size: null
offline-chat:
chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin
chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf
enable-offline-chat: false
openai:
api-key: sk-blah
@@ -46,7 +46,7 @@ processor:
- chat-model: gpt-3.5-turbo
tokenizer: null
type: openai
- chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin
- chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf
tokenizer: null
type: offline
search-type:

View file

@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
def extract_questions_offline(
    text: str,
    model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
    model: str = "mistral-7b-instruct-v0.1.Q4_0.gguf",
    loaded_model: Union[Any, None] = None,
    conversation_log={},
    use_history: bool = True,
@@ -123,7 +123,7 @@ def converse_offline(
    references,
    user_query,
    conversation_log={},
    model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
    model: str = "mistral-7b-instruct-v0.1.Q4_0.gguf",
    loaded_model: Union[Any, None] = None,
    completion_func=None,
    conversation_command=ConversationCommand.Default,

View file

@@ -1,5 +1,6 @@
import logging

from khoj.utils import state

logger = logging.getLogger(__name__)
@@ -16,8 +17,13 @@ def download_model(model_name: str):
    # Decide whether to load model to GPU or CPU
    try:
        # Check if machine has GPU and GPU has enough free memory to load the chat model
        device = "gpu" if gpt4all.pyllmodel.LLModel().list_gpu(chat_model_config["path"]) else "cpu"
        # Try to load the chat model on GPU if:
        # 1. Loading the chat model on GPU isn't disabled via CLI, and
        # 2. The machine has a GPU, and
        # 3. The GPU has enough free memory to load the chat model
        device = (
            "gpu" if state.chat_on_gpu and gpt4all.pyllmodel.LLModel().list_gpu(chat_model_config["path"]) else "cpu"
        )
    except ValueError:
        device = "cpu"
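For a rough picture of why this `device` string matters, here is a hedged sketch of loading the chat model with gpt4all >= 2.0.0, whose `GPT4All()` constructor accepts a `device` argument; the model filename and generation call are illustrative, not khoj's exact loading code.

```python
# Minimal sketch, assuming gpt4all >= 2.0.0; downloads the GGUF model if it is
# not already present locally. In khoj, `device` comes from the check above
# (state.chat_on_gpu plus the GPU memory probe), falling back to "cpu".
from gpt4all import GPT4All

device = "cpu"  # or "gpu" when the checks above succeed
model = GPT4All(model_name="mistral-7b-instruct-v0.1.Q4_0.gguf", device=device)
print(model.generate("Say hello in one short sentence.", max_tokens=32))
```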

View file

@@ -20,9 +20,11 @@ model_to_prompt_size = {
    "gpt-4": 8192,
    "llama-2-7b-chat.ggmlv3.q4_0.bin": 1548,
    "gpt-3.5-turbo-16k": 15000,
    "mistral-7b-instruct-v0.1.Q4_0.gguf": 1548,
}
model_to_tokenizer = {
    "llama-2-7b-chat.ggmlv3.q4_0.bin": "hf-internal-testing/llama-tokenizer",
    "mistral-7b-instruct-v0.1.Q4_0.gguf": "mistralai/Mistral-7B-Instruct-v0.1",
}
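A small sketch of how these two maps might be consulted for the new GGUF entry; the fallback value and the token-counting use of the tokenizer name are illustrative assumptions, not the project's confirmed code path.

```python
# Hedged sketch: look up context window size and tokenizer for the configured chat model.
model_to_prompt_size = {"mistral-7b-instruct-v0.1.Q4_0.gguf": 1548}
model_to_tokenizer = {"mistral-7b-instruct-v0.1.Q4_0.gguf": "mistralai/Mistral-7B-Instruct-v0.1"}

chat_model = "mistral-7b-instruct-v0.1.Q4_0.gguf"
max_prompt_size = model_to_prompt_size.get(chat_model, 1548)  # tokens of chat context to keep
tokenizer_name = model_to_tokenizer.get(chat_model)  # Hugging Face tokenizer used to count tokens
print(max_prompt_size, tokenizer_name)
```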

View file

@@ -14,6 +14,7 @@ from khoj.migrations.migrate_version import migrate_config_to_version
from khoj.migrations.migrate_processor_config_openai import migrate_processor_conversation_schema
from khoj.migrations.migrate_offline_model import migrate_offline_model
from khoj.migrations.migrate_offline_chat_schema import migrate_offline_chat_schema
from khoj.migrations.migrate_offline_chat_default_model import migrate_offline_chat_default_model
from khoj.migrations.migrate_server_pg import migrate_server_pg
@@ -38,6 +39,9 @@ def cli(args=None):
        help="Path to UNIX socket for server. Use to run server behind reverse proxy. Default: /tmp/uvicorn.sock",
    )
    parser.add_argument("--version", "-V", action="store_true", help="Print the installed Khoj version and exit")
    parser.add_argument(
        "--disable-chat-on-gpu", action="store_true", default=False, help="Disable using GPU for the offline chat model"
    )
    parser.add_argument("--demo", action="store_true", default=False, help="Run Khoj in demo mode")
    parser.add_argument(
        "--anonymous-mode",
@@ -50,6 +54,9 @@ def cli(args=None):
    logger.debug(f"Ignoring unknown commandline args: {remaining_args}")

    # Set default values for arguments
    args.chat_on_gpu = not args.disable_chat_on_gpu

    args.version_no = version("khoj-assistant")
    if args.version:
        # Show version of khoj installed and exit
@@ -76,6 +83,7 @@ def run_migrations(args):
        migrate_processor_conversation_schema,
        migrate_offline_model,
        migrate_offline_chat_schema,
        migrate_offline_chat_default_model,
        migrate_server_pg,
    ]
    for migration in migrations:
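To make the new flag's plumbing concrete, here is a minimal argparse sketch of the `--disable-chat-on-gpu` to `chat_on_gpu` flow; it is a simplified stand-in for khoj's `cli()` and `set_state()`, not those functions themselves.

```python
# Hedged sketch of the flag-to-state flow introduced in this change.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--disable-chat-on-gpu", action="store_true", default=False,
    help="Disable using GPU for the offline chat model",
)
args = parser.parse_args(["--disable-chat-on-gpu"])

args.chat_on_gpu = not args.disable_chat_on_gpu  # True by default, False when the flag is passed
print(args.chat_on_gpu)  # False -> download_model() later picks device = "cpu"
```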

View file

@@ -84,7 +84,7 @@ class OpenAIProcessorConfig(ConfigBase):
class OfflineChatProcessorConfig(ConfigBase):
    enable_offline_chat: Optional[bool] = False
    chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin"
    chat_model: Optional[str] = "mistral-7b-instruct-v0.1.Q4_0.gguf"


class ConversationProcessorConfig(ConfigBase):
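A tiny sketch of what the new default means for config objects created without arguments; the import path is an assumption based on the hunk's surrounding classes, not confirmed by this diff.

```python
# Hedged sketch: OfflineChatProcessorConfig now defaults to the GGUF chat model.
from khoj.utils.rawconfig import OfflineChatProcessorConfig  # assumed module path

config = OfflineChatProcessorConfig()
print(config.chat_model)           # "mistral-7b-instruct-v0.1.Q4_0.gguf"
print(config.enable_offline_chat)  # False
```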

View file

@@ -33,5 +33,6 @@ SearchType = utils_config.SearchType
telemetry: List[Dict[str, str]] = []
demo: bool = False
khoj_version: str = None
anonymous_mode: bool = False
device = get_device()
chat_on_gpu: bool = True
anonymous_mode: bool = False

View file

@@ -169,7 +169,7 @@ def md_content_config():
    return markdown_config


@pytest.fixture(scope="function")
@pytest.fixture(scope="session")
def chat_client(search_config: SearchConfig, default_user2: KhojUser):
    # Initialize app state
    state.config.search_type = search_config
@@ -211,7 +211,7 @@ def chat_client_no_background(search_config: SearchConfig, default_user2: KhojUser):
    # Initialize Processor from Config
    if os.getenv("OPENAI_API_KEY"):
        OpenAIProcessorConversationConfigFactory(user=default_user2)
        OpenAIProcessorConversationConfigFactory()

    state.anonymous_mode = True

View file

@@ -14,4 +14,4 @@ search-type:
asymmetric:
cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2
encoder: sentence-transformers/msmarco-MiniLM-L-6-v3
version: 0.10.1
version: 0.14.0

View file

@@ -37,7 +37,7 @@ class ChatModelOptionsFactory(factory.django.DjangoModelFactory):
    max_prompt_size = 2000
    tokenizer = None
    chat_model = "llama-2-7b-chat.ggmlv3.q4_0.bin"
    chat_model = "mistral-7b-instruct-v0.1.Q4_0.gguf"
    model_type = "offline"

View file

@@ -24,7 +24,7 @@ from khoj.processor.conversation.gpt4all.utils import download_model
from khoj.processor.conversation.utils import message_to_log
MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
MODEL_NAME = "mistral-7b-instruct-v0.1.Q4_0.gguf"
@pytest.fixture(scope="session")