Merge branch 'master' of github.com:khoj-ai/khoj into features/multi-user-support-khoj

Merge changes to use the latest GPT4All, with GPU and GGUF model support, into the khoj multi-user support rearchitecture branch
Debanjum Singh Solanky 2023-11-02 22:44:25 -07:00
commit 345856e7be
16 changed files with 107 additions and 18 deletions

View file

@@ -10,7 +10,8 @@
 Offline chat stays completely private and works without internet. But it is slower, lower quality and more compute intensive.
 > **System Requirements**:
-> - Machine with at least **6 GB of RAM** and **4 GB of Disk** available
+> - Minimum 8 GB RAM. Recommend **16Gb VRAM**
+> - Minimum **5 GB of Disk** available
 > - A CPU supporting [AVX or AVX2 instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) is required
 > - A Mac M1+ or [Vulcan supported GPU](https://vulkan.gpuinfo.org/) should significantly speed up chat response times

View file

@@ -62,8 +62,8 @@ dependencies = [
     "pymupdf >= 1.23.5",
     "django == 4.2.5",
     "authlib == 1.2.1",
-    "gpt4all == 1.0.12; platform_system == 'Linux' and platform_machine == 'x86_64'",
-    "gpt4all == 1.0.12; platform_system == 'Windows' or platform_system == 'Darwin'",
+    "gpt4all >= 2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'",
+    "gpt4all >= 2.0.0; platform_system == 'Windows' or platform_system == 'Darwin'",
     "itsdangerous == 2.1.2",
     "httpx == 0.25.0",
     "pgvector == 0.2.3",

View file

@@ -38,7 +38,7 @@ export class KhojChatModal extends Modal {
         await this.getChatHistory();

         // Add chat input field
-        contentEl.createEl("input",
+        const chatInput = contentEl.createEl("input",
             {
                 attr: {
                     type: "text",
@@ -48,10 +48,11 @@ export class KhojChatModal extends Modal {
                     class: "khoj-chat-input option"
                 }
             })
-            .addEventListener('change', (event) => { this.result = (<HTMLInputElement>event.target).value });
+        chatInput.addEventListener('change', (event) => { this.result = (<HTMLInputElement>event.target).value });

         // Scroll to bottom of modal, till the send message input box
         this.modalEl.scrollTop = this.modalEl.scrollHeight;
+        chatInput.focus();
     }

     generateReference(messageEl: any, reference: string, index: number) {

View file

@@ -122,6 +122,7 @@ def set_state(args):
     state.demo = args.demo
     state.anonymous_mode = args.anonymous_mode
     state.khoj_version = version("khoj-assistant")
+    state.chat_on_gpu = args.chat_on_gpu


 def start_server(app, host=None, port=None, socket=None):

View file

@@ -0,0 +1,69 @@
+"""
+Current format of khoj.yml
+---
+app:
+  ...
+content-type:
+  ...
+processor:
+  conversation:
+    offline-chat:
+      enable-offline-chat: false
+      chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin
+  ...
+search-type:
+  ...
+
+New format of khoj.yml
+---
+app:
+  ...
+content-type:
+  ...
+processor:
+  conversation:
+    offline-chat:
+      enable-offline-chat: false
+      chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf
+  ...
+search-type:
+  ...
+"""
+import logging
+
+from packaging import version
+
+from khoj.utils.yaml import load_config_from_file, save_config_to_file
+
+logger = logging.getLogger(__name__)
+
+
+def migrate_offline_chat_default_model(args):
+    schema_version = "0.12.4"
+    raw_config = load_config_from_file(args.config_file)
+    previous_version = raw_config.get("version")
+
+    if "processor" not in raw_config:
+        return args
+    if raw_config["processor"] is None:
+        return args
+    if "conversation" not in raw_config["processor"]:
+        return args
+    if "offline-chat" not in raw_config["processor"]["conversation"]:
+        return args
+    if "chat-model" not in raw_config["processor"]["conversation"]["offline-chat"]:
+        return args
+
+    if previous_version is None or version.parse(previous_version) < version.parse("0.12.4"):
+        logger.info(
+            f"Upgrading config schema to {schema_version} from {previous_version} to change default (offline) chat model to mistral GGUF"
+        )
+        raw_config["version"] = schema_version
+
+        # Update offline chat model to mistral in GGUF format to use latest GPT4All
+        offline_chat_model = raw_config["processor"]["conversation"]["offline-chat"]["chat-model"]
+        if offline_chat_model.endswith(".bin"):
+            raw_config["processor"]["conversation"]["offline-chat"]["chat-model"] = "mistral-7b-instruct-v0.1.Q4_0.gguf"
+
+        save_config_to_file(raw_config, args.config_file)
+
+    return args
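The migration only rewrites the offline chat model when the stored config predates schema version 0.12.4 and still points at a legacy `.bin` (GGML) model. A hypothetical standalone invocation, using a throwaway `SimpleNamespace` in place of khoj's parsed CLI args (the config path is an example):

```python
# Hypothetical standalone run of the new migration; the args object and path are illustrative.
from types import SimpleNamespace

from khoj.migrations.migrate_offline_chat_default_model import migrate_offline_chat_default_model
from khoj.utils.yaml import load_config_from_file

args = SimpleNamespace(config_file="/home/user/.khoj/khoj.yml")  # example path
migrate_offline_chat_default_model(args)

# A legacy llama-2 *.bin chat-model should now have been rewritten to the GGUF default.
print(load_config_from_file(args.config_file)["processor"]["conversation"]["offline-chat"]["chat-model"])
```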

View file

@@ -9,7 +9,7 @@ processor:
     conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json
     max-prompt-size: null
     offline-chat:
-      chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin
+      chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf
       enable-offline-chat: false
     openai:
       api-key: sk-blah
@@ -46,7 +46,7 @@ processor:
    - chat-model: gpt-3.5-turbo
      tokenizer: null
      type: openai
-   - chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin
+   - chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf
      tokenizer: null
      type: offline
 search-type:

View file

@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)

 def extract_questions_offline(
     text: str,
-    model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
+    model: str = "mistral-7b-instruct-v0.1.Q4_0.gguf",
     loaded_model: Union[Any, None] = None,
     conversation_log={},
     use_history: bool = True,
@@ -123,7 +123,7 @@ def converse_offline(
     references,
     user_query,
     conversation_log={},
-    model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
+    model: str = "mistral-7b-instruct-v0.1.Q4_0.gguf",
     loaded_model: Union[Any, None] = None,
     completion_func=None,
     conversation_command=ConversationCommand.Default,
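Both offline chat entry points now default to the Mistral GGUF model. A hypothetical call that reuses a preloaded model (assuming `download_model` returns the gpt4all model object and `extract_questions_offline` returns a list of search query strings; neither return type is shown in this diff):

```python
# Illustrative only; the return types noted in comments are assumptions, not shown in this diff.
from khoj.processor.conversation.gpt4all.chat_model import extract_questions_offline
from khoj.processor.conversation.gpt4all.utils import download_model

loaded_model = download_model("mistral-7b-instruct-v0.1.Q4_0.gguf")  # assumed: gpt4all model object
questions = extract_questions_offline(
    "What did I write about GGUF models last week?",
    loaded_model=loaded_model,  # skip reloading the default model
    use_history=False,
)
print(questions)  # assumed: list of extracted search queries
```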

View file

@@ -1,5 +1,6 @@
 import logging

+from khoj.utils import state

 logger = logging.getLogger(__name__)
@@ -16,8 +17,13 @@ def download_model(model_name: str):
     # Decide whether to load model to GPU or CPU
     try:
-        # Check if machine has GPU and GPU has enough free memory to load the chat model
-        device = "gpu" if gpt4all.pyllmodel.LLModel().list_gpu(chat_model_config["path"]) else "cpu"
+        # Try load chat model to GPU if:
+        # 1. Loading chat model to GPU isn't disabled via CLI and
+        # 2. Machine has GPU
+        # 3. GPU has enough free memory to load the chat model
+        device = (
+            "gpu" if state.chat_on_gpu and gpt4all.pyllmodel.LLModel().list_gpu(chat_model_config["path"]) else "cpu"
+        )
     except ValueError:
         device = "cpu"

View file

@@ -20,9 +20,11 @@ model_to_prompt_size = {
     "gpt-4": 8192,
     "llama-2-7b-chat.ggmlv3.q4_0.bin": 1548,
     "gpt-3.5-turbo-16k": 15000,
+    "mistral-7b-instruct-v0.1.Q4_0.gguf": 1548,
 }
 model_to_tokenizer = {
     "llama-2-7b-chat.ggmlv3.q4_0.bin": "hf-internal-testing/llama-tokenizer",
+    "mistral-7b-instruct-v0.1.Q4_0.gguf": "mistralai/Mistral-7B-Instruct-v0.1",
 }
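The new entries give the Mistral GGUF model a 1548-token prompt budget and map it to the upstream Hugging Face tokenizer. A hypothetical illustration of using these maps to check a prompt against that budget (assuming the maps live in `khoj.processor.conversation.utils`; khoj's real truncation logic is not shown in this diff):

```python
# Illustrative only: look up the tokenizer and prompt budget for the new default model.
from transformers import AutoTokenizer

from khoj.processor.conversation.utils import model_to_prompt_size, model_to_tokenizer

model = "mistral-7b-instruct-v0.1.Q4_0.gguf"
tokenizer = AutoTokenizer.from_pretrained(model_to_tokenizer[model])

prompt = "Summarize my notes on GGUF quantization."
token_count = len(tokenizer.encode(prompt))
print(f"{token_count} of {model_to_prompt_size[model]} prompt tokens used")
```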

View file

@@ -14,6 +14,7 @@ from khoj.migrations.migrate_version import migrate_config_to_version
 from khoj.migrations.migrate_processor_config_openai import migrate_processor_conversation_schema
 from khoj.migrations.migrate_offline_model import migrate_offline_model
 from khoj.migrations.migrate_offline_chat_schema import migrate_offline_chat_schema
+from khoj.migrations.migrate_offline_chat_default_model import migrate_offline_chat_default_model
 from khoj.migrations.migrate_server_pg import migrate_server_pg
@@ -38,6 +39,9 @@ def cli(args=None):
         help="Path to UNIX socket for server. Use to run server behind reverse proxy. Default: /tmp/uvicorn.sock",
     )
     parser.add_argument("--version", "-V", action="store_true", help="Print the installed Khoj version and exit")
+    parser.add_argument(
+        "--disable-chat-on-gpu", action="store_true", default=False, help="Disable using GPU for the offline chat model"
+    )
     parser.add_argument("--demo", action="store_true", default=False, help="Run Khoj in demo mode")
     parser.add_argument(
         "--anonymous-mode",
@@ -50,6 +54,9 @@ def cli(args=None):
     logger.debug(f"Ignoring unknown commandline args: {remaining_args}")

+    # Set default values for arguments
+    args.chat_on_gpu = not args.disable_chat_on_gpu
+
     args.version_no = version("khoj-assistant")
     if args.version:
         # Show version of khoj installed and exit
@@ -76,6 +83,7 @@ def run_migrations(args):
         migrate_processor_conversation_schema,
         migrate_offline_model,
         migrate_offline_chat_schema,
+        migrate_offline_chat_default_model,
         migrate_server_pg,
     ]
     for migration in migrations:
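Taken together, `khoj --disable-chat-on-gpu` sets `args.chat_on_gpu = False`, `set_state` copies that into `state.chat_on_gpu`, and the GPT4All loader above consults it. A minimal sketch of just the flag inversion, using a throwaway parser rather than khoj's real one:

```python
# Minimal sketch of the new opt-out flag; this parser is illustrative, not khoj's.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--disable-chat-on-gpu", action="store_true", default=False, help="Disable using GPU for the offline chat model"
)

args = parser.parse_args(["--disable-chat-on-gpu"])
args.chat_on_gpu = not args.disable_chat_on_gpu  # GPU use defaults to on; the flag turns it off
print(args.chat_on_gpu)  # False
```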

View file

@@ -84,7 +84,7 @@ class OpenAIProcessorConfig(ConfigBase):

 class OfflineChatProcessorConfig(ConfigBase):
     enable_offline_chat: Optional[bool] = False
-    chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin"
+    chat_model: Optional[str] = "mistral-7b-instruct-v0.1.Q4_0.gguf"


 class ConversationProcessorConfig(ConfigBase):

View file

@@ -33,5 +33,6 @@ SearchType = utils_config.SearchType
 telemetry: List[Dict[str, str]] = []
 demo: bool = False
 khoj_version: str = None
-anonymous_mode: bool = False
 device = get_device()
+chat_on_gpu: bool = True
+anonymous_mode: bool = False

View file

@@ -169,7 +169,7 @@ def md_content_config():
     return markdown_config


-@pytest.fixture(scope="function")
+@pytest.fixture(scope="session")
 def chat_client(search_config: SearchConfig, default_user2: KhojUser):
     # Initialize app state
     state.config.search_type = search_config
@@ -211,7 +211,7 @@ def chat_client_no_background(search_config: SearchConfig, default_user2: KhojUs
     # Initialize Processor from Config
     if os.getenv("OPENAI_API_KEY"):
-        OpenAIProcessorConversationConfigFactory(user=default_user2)
+        OpenAIProcessorConversationConfigFactory()

     state.anonymous_mode = True

View file

@@ -14,4 +14,4 @@ search-type:
   asymmetric:
     cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2
     encoder: sentence-transformers/msmarco-MiniLM-L-6-v3
-version: 0.10.1
+version: 0.14.0

View file

@@ -37,7 +37,7 @@ class ChatModelOptionsFactory(factory.django.DjangoModelFactory):
     max_prompt_size = 2000
     tokenizer = None
-    chat_model = "llama-2-7b-chat.ggmlv3.q4_0.bin"
+    chat_model = "mistral-7b-instruct-v0.1.Q4_0.gguf"
     model_type = "offline"

View file

@@ -24,7 +24,7 @@ from khoj.processor.conversation.gpt4all.utils import download_model
 from khoj.processor.conversation.utils import message_to_log

-MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
+MODEL_NAME = "mistral-7b-instruct-v0.1.Q4_0.gguf"


 @pytest.fixture(scope="session")