mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Improve docker builds for local hosting (#476)
* Remove GPT4All dependency in pyproject.toml and use multiplatform builds in the dockerization setup in GH actions * Move configure_search method into indexer * Add conditional installation for gpt4all * Add hint to go to localhost:42110 in the docs. Addresses #477
This commit is contained in:
parent
dccfae3853
commit
343854752c
10 changed files with 122 additions and 42 deletions
1
.github/workflows/dockerize.yml
vendored
1
.github/workflows/dockerize.yml
vendored
|
@ -41,6 +41,7 @@ jobs:
|
|||
with:
|
||||
context: .
|
||||
file: Dockerfile
|
||||
platforms: linux/amd64, linux/arm64
|
||||
push: true
|
||||
tags: ghcr.io/${{ github.repository }}:${{ env.DOCKER_IMAGE_TAG }}
|
||||
build-args: |
|
||||
|
|
|
@ -24,5 +24,10 @@ services:
|
|||
# You can set these volumes to point to empty directories on host
|
||||
- ./tests/data/embeddings/:/root/.khoj/content/
|
||||
- ./tests/data/models/:/root/.khoj/search/
|
||||
- khoj_config:/root/.khoj/
|
||||
# Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/
|
||||
command: --host="0.0.0.0" --port=42110 -c=config/khoj_docker.yml -vv
|
||||
command: --host="0.0.0.0" --port=42110 -vv
|
||||
|
||||
|
||||
volumes:
|
||||
khoj_config:
|
||||
|
|
|
@ -8,6 +8,8 @@ These are the general setup instructions for Khoj.
|
|||
It's simpler as it can skip the *configure* step below.
|
||||
|
||||
### 1. Install
|
||||
|
||||
#### 1.1 Local Setup
|
||||
Run the following command in your terminal to install the Khoj backend.
|
||||
|
||||
- On Linux/MacOS
|
||||
|
@ -22,7 +24,7 @@ Run the following command in your terminal to install the Khoj backend.
|
|||
For more detailed Windows installation and troubleshooting, see [Windows Install](./windows_install.md).
|
||||
|
||||
|
||||
### 2. Start
|
||||
##### 1.1.1 Start
|
||||
|
||||
Run the following command from your terminal to start the Khoj backend and open Khoj in your browser.
|
||||
|
||||
|
@ -30,16 +32,27 @@ Run the following command from your terminal to start the Khoj backend and open
|
|||
khoj
|
||||
```
|
||||
|
||||
Khoj should now be running at http://localhost:42110. You can see the web UI in your browser.
|
||||
|
||||
Note: To start Khoj automatically in the background use [Task scheduler](https://www.windowscentral.com/how-create-automated-task-using-task-scheduler-windows-10) on Windows or [Cron](https://en.wikipedia.org/wiki/Cron) on Mac or Linux (e.g. with `@reboot khoj`).
|
||||
|
||||
### 3. Configure
|
||||
#### 1.2 Docker Setup
|
||||
Use the sample docker-compose [on GitHub](https://github.com/khoj-ai/khoj/blob/master/docker-compose.yml) to run Khoj in Docker. To start the container, run the following command in the same directory as the `docker-compose.yml` file. You'll have to configure the mounted directories to match your local knowledge base.
|
||||
|
||||
```shell
|
||||
docker-compose up
|
||||
```
|
||||
|
||||
Khoj should now be running at http://localhost:42110. You can see the web UI in your browser.
|
||||
|
||||
### 2. Configure
|
||||
1. Set `File`, `Folder` and hit `Save` in each plugin you want to enable for search on the Khoj config page.
|
||||
2. Add your OpenAI API key to Chat Feature settings if you want to use Chat
|
||||
3. Click `Configure` and wait. The app will download ML models and index the content for search and (optionally) chat
|
||||
|
||||
![configure demo](https://user-images.githubusercontent.com/6413477/255307879-61247d3f-c69a-46ef-b058-9bc533cb5c72.mp4 ':include :type=mp4')
|
||||
|
||||
### 4. Install Interface Plugins (Optional)
|
||||
### 3. Install Interface Plugins (Optional)
|
||||
Khoj exposes a web interface to search, chat and configure by default.<br />
|
||||
The optional steps below allow using Khoj from within an existing application like Obsidian or Emacs.
|
||||
|
||||
|
|
|
@ -57,9 +57,10 @@ dependencies = [
|
|||
"langchain >= 0.0.187",
|
||||
"requests >= 2.26.0",
|
||||
"bs4 >= 0.0.1",
|
||||
"gpt4all >= 1.0.7",
|
||||
"anyio == 3.7.1",
|
||||
"pymupdf >= 1.23.3",
|
||||
"gpt4all == 0.1.9; platform_system == 'Linux' and platform_machine == 'x86_64'",
|
||||
"gpt4all == 0.1.9; platform_system == 'Windows' or platform_system == 'Darwin'",
|
||||
]
|
||||
dynamic = ["version"]
|
||||
|
||||
|
|
|
@ -11,18 +11,16 @@ import schedule
|
|||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
# Internal Packages
|
||||
from khoj.search_type import image_search, text_search
|
||||
from khoj.utils import constants, state
|
||||
from khoj.utils.config import (
|
||||
SearchType,
|
||||
SearchModels,
|
||||
ProcessorConfigModel,
|
||||
ConversationProcessorConfigModel,
|
||||
)
|
||||
from khoj.utils.helpers import resolve_absolute_path, merge_dicts
|
||||
from khoj.utils.fs_syncer import collect_files
|
||||
from khoj.utils.rawconfig import FullConfig, ProcessorConfig, SearchConfig, ConversationProcessorConfig
|
||||
from khoj.routers.indexer import configure_content, load_content
|
||||
from khoj.utils.rawconfig import FullConfig, ProcessorConfig, ConversationProcessorConfig
|
||||
from khoj.routers.indexer import configure_content, load_content, configure_search
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -136,26 +134,6 @@ def configure_search_types(config: FullConfig):
|
|||
return Enum("SearchType", merge_dicts(core_search_types, plugin_search_types))
|
||||
|
||||
|
||||
def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
    """Initialize the search models enabled in `search_config`.

    Logs a warning and returns None when `search_config` is None.
    Otherwise the text and/or image search models are initialized on
    `search_models` (a new `SearchModels` is created when None is passed)
    and that object is returned.
    """
    # Guard: nothing can be set up without a search configuration
    if search_config is None:
        logger.warning("🚨 No Search configuration available.")
        return None

    # Reuse the caller's container when provided, else start from a new one
    models = SearchModels() if search_models is None else search_models

    # Text search model, only when an asymmetric config is set
    text_model_config = search_config.asymmetric
    if text_model_config:
        logger.info("🔍 📜 Setting up text search model")
        models.text_search = text_search.initialize_model(text_model_config)

    # Image search model, only when an image config is set
    image_model_config = search_config.image
    if image_model_config:
        logger.info("🔍 🌄 Setting up image search model")
        models.image_search = image_search.initialize_model(image_model_config)

    return models
|
||||
|
||||
|
||||
def configure_processor(
|
||||
processor_config: Optional[ProcessorConfig], state_processor_config: Optional[ProcessorConfigModel] = None
|
||||
):
|
||||
|
|
|
@ -100,3 +100,7 @@ def poll_task_scheduler():
|
|||
timer_thread.daemon = True
|
||||
timer_thread.start()
|
||||
schedule.run_pending()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run()
|
||||
|
|
|
@ -1,12 +1,10 @@
|
|||
from typing import Iterator, Union, List
|
||||
from typing import Iterator, Union, List, Any
|
||||
from datetime import datetime
|
||||
import logging
|
||||
from threading import Thread
|
||||
|
||||
from langchain.schema import ChatMessage
|
||||
|
||||
from gpt4all import GPT4All
|
||||
|
||||
from khoj.processor.conversation.utils import ThreadedGenerator, generate_chatml_messages_with_context
|
||||
from khoj.processor.conversation import prompts
|
||||
from khoj.utils.constants import empty_escape_sequences
|
||||
|
@ -19,7 +17,7 @@ logger = logging.getLogger(__name__)
|
|||
def extract_questions_offline(
|
||||
text: str,
|
||||
model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin",
|
||||
loaded_model: Union[GPT4All, None] = None,
|
||||
loaded_model: Union[Any, None] = None,
|
||||
conversation_log={},
|
||||
use_history: bool = True,
|
||||
should_extract_questions: bool = True,
|
||||
|
@ -27,6 +25,15 @@ def extract_questions_offline(
|
|||
"""
|
||||
Infer search queries to retrieve relevant notes to answer user query
|
||||
"""
|
||||
try:
|
||||
from gpt4all import GPT4All
|
||||
except ModuleNotFoundError as e:
|
||||
logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
|
||||
raise e
|
||||
|
||||
# Assert that loaded_model is either None or of type GPT4All
|
||||
assert loaded_model is None or isinstance(loaded_model, GPT4All), "loaded_model must be of type GPT4All or None"
|
||||
|
||||
all_questions = text.split("? ")
|
||||
all_questions = [q + "?" for q in all_questions[:-1]] + [all_questions[-1]]
|
||||
|
||||
|
@ -117,13 +124,20 @@ def converse_offline(
|
|||
user_query,
|
||||
conversation_log={},
|
||||
model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin",
|
||||
loaded_model: Union[GPT4All, None] = None,
|
||||
loaded_model: Union[Any, None] = None,
|
||||
completion_func=None,
|
||||
conversation_command=ConversationCommand.Default,
|
||||
) -> Union[ThreadedGenerator, Iterator[str]]:
|
||||
"""
|
||||
Converse with user using Llama
|
||||
"""
|
||||
try:
|
||||
from gpt4all import GPT4All
|
||||
except ModuleNotFoundError as e:
|
||||
logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
|
||||
raise e
|
||||
|
||||
assert loaded_model is None or isinstance(loaded_model, GPT4All), "loaded_model must be of type GPT4All or None"
|
||||
gpt4all_model = loaded_model or GPT4All(model)
|
||||
# Initialize Variables
|
||||
compiled_references_message = "\n\n".join({f"{item}" for item in references})
|
||||
|
@ -152,7 +166,14 @@ def converse_offline(
|
|||
return g
|
||||
|
||||
|
||||
def llm_thread(g, messages: List[ChatMessage], model: GPT4All):
|
||||
def llm_thread(g, messages: List[ChatMessage], model: Any):
|
||||
try:
|
||||
from gpt4all import GPT4All
|
||||
except ModuleNotFoundError as e:
|
||||
logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
|
||||
raise e
|
||||
|
||||
assert isinstance(model, GPT4All), "model should be of type GPT4All"
|
||||
user_message = messages[-1]
|
||||
system_message = messages[0]
|
||||
conversation_history = messages[1:-1]
|
||||
|
|
|
@ -3,7 +3,6 @@ import logging
|
|||
import requests
|
||||
import hashlib
|
||||
|
||||
from gpt4all import GPT4All
|
||||
from tqdm import tqdm
|
||||
|
||||
from khoj.processor.conversation.gpt4all import model_metadata
|
||||
|
@ -22,6 +21,12 @@ def get_md5_checksum(filename: str):
|
|||
|
||||
|
||||
def download_model(model_name: str):
|
||||
try:
|
||||
from gpt4all import GPT4All
|
||||
except ModuleNotFoundError as e:
|
||||
logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
|
||||
raise e
|
||||
|
||||
url = model_metadata.model_name_to_url.get(model_name)
|
||||
if not url:
|
||||
logger.debug(f"Model {model_name} not found in model metadata. Skipping download.")
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
# Standard Packages
|
||||
import logging
|
||||
import sys
|
||||
import json
|
||||
from typing import Optional, Union, Dict
|
||||
|
||||
# External Packages
|
||||
|
@ -8,7 +9,7 @@ from fastapi import APIRouter, HTTPException, Header, Request, Body, Response
|
|||
from pydantic import BaseModel
|
||||
|
||||
# Internal Packages
|
||||
from khoj.utils import state
|
||||
from khoj.utils import state, constants
|
||||
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
|
||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
|
@ -18,11 +19,14 @@ from khoj.processor.notion.notion_to_jsonl import NotionToJsonl
|
|||
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
|
||||
from khoj.utils.rawconfig import ContentConfig, TextContentConfig
|
||||
from khoj.search_type import text_search, image_search
|
||||
from khoj.utils.yaml import save_config_to_file_updated_state
|
||||
from khoj.utils.config import SearchModels
|
||||
from khoj.utils.constants import default_config
|
||||
from khoj.utils.helpers import LRU, get_file_type
|
||||
from khoj.utils.rawconfig import (
|
||||
ContentConfig,
|
||||
FullConfig,
|
||||
SearchConfig,
|
||||
)
|
||||
from khoj.search_filter.date_filter import DateFilter
|
||||
from khoj.search_filter.word_filter import WordFilter
|
||||
|
@ -111,6 +115,28 @@ async def index_batch(
|
|||
plaintext=plaintext_files,
|
||||
)
|
||||
|
||||
if state.config == None:
|
||||
logger.info("First run, initializing state.")
|
||||
default_full_config = FullConfig(
|
||||
content_type=None,
|
||||
search_type=SearchConfig.parse_obj(constants.default_config["search-type"]),
|
||||
processor=None,
|
||||
)
|
||||
state.config = default_full_config
|
||||
default_content_config = ContentConfig(
|
||||
org=None,
|
||||
markdown=None,
|
||||
pdf=None,
|
||||
image=None,
|
||||
github=None,
|
||||
notion=None,
|
||||
plaintext=None,
|
||||
plugins=None,
|
||||
)
|
||||
state.config.content_type = default_content_config
|
||||
save_config_to_file_updated_state()
|
||||
configure_search(state.search_models, state.config.search_type)
|
||||
|
||||
# Extract required fields from config
|
||||
state.content_index = configure_content(
|
||||
state.content_index,
|
||||
|
@ -129,6 +155,26 @@ async def index_batch(
|
|||
return Response(content="OK", status_code=200)
|
||||
|
||||
|
||||
def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
    """Set up the search models requested by `search_config`.

    Args:
        search_models: Existing model container to populate. A new
            `SearchModels` is created when this is None.
        search_config: Search configuration. Its `asymmetric` and `image`
            entries decide which models get initialized.

    Returns:
        The populated model container, or None (after logging a warning)
        when `search_config` is None.
    """
    # Guard: without a search configuration there is nothing to initialize
    if search_config is None:
        logger.warning("🚨 No Search configuration available.")
        return None

    # Populate the caller's container if given, otherwise a new one
    models = SearchModels() if search_models is None else search_models

    # Text search model
    text_model_config = search_config.asymmetric
    if text_model_config:
        logger.info("🔍 📜 Setting up text search model")
        models.text_search = text_search.initialize_model(text_model_config)

    # Image search model
    image_model_config = search_config.image
    if image_model_config:
        logger.info("🔍 🌄 Setting up image search model")
        models.image_search = image_search.initialize_model(image_model_config)

    return models
|
||||
|
||||
|
||||
def configure_content(
|
||||
content_index: Optional[ContentIndex],
|
||||
content_config: Optional[ContentConfig],
|
||||
|
@ -138,6 +184,9 @@ def configure_content(
|
|||
t: Optional[Union[state.SearchType, str]] = None,
|
||||
full_corpus: bool = True,
|
||||
) -> Optional[ContentIndex]:
|
||||
def has_valid_text_config(config: TextContentConfig):
|
||||
return config.input_files or config.input_filter
|
||||
|
||||
# Run Validation Checks
|
||||
if content_config is None:
|
||||
logger.warning("🚨 No Content configuration available.")
|
||||
|
@ -158,7 +207,7 @@ def configure_content(
|
|||
# Initialize Org Notes Search
|
||||
if (
|
||||
(t == None or t == state.SearchType.Org.value)
|
||||
and (content_config.org or files["org"])
|
||||
and ((content_config.org and has_valid_text_config(content_config.org)) or files["org"])
|
||||
and search_models.text_search
|
||||
):
|
||||
if content_config.org == None:
|
||||
|
@ -187,7 +236,7 @@ def configure_content(
|
|||
# Initialize Markdown Search
|
||||
if (
|
||||
(t == None or t == state.SearchType.Markdown.value)
|
||||
and (content_config.markdown or files["markdown"])
|
||||
and ((content_config.markdown and has_valid_text_config(content_config.markdown)) or files["markdown"])
|
||||
and search_models.text_search
|
||||
and files["markdown"]
|
||||
):
|
||||
|
@ -218,7 +267,7 @@ def configure_content(
|
|||
# Initialize PDF Search
|
||||
if (
|
||||
(t == None or t == state.SearchType.Pdf.value)
|
||||
and (content_config.pdf or files["pdf"])
|
||||
and ((content_config.pdf and has_valid_text_config(content_config.pdf)) or files["pdf"])
|
||||
and search_models.text_search
|
||||
and files["pdf"]
|
||||
):
|
||||
|
@ -249,7 +298,7 @@ def configure_content(
|
|||
# Initialize Plaintext Search
|
||||
if (
|
||||
(t == None or t == state.SearchType.Plaintext.value)
|
||||
and (content_config.plaintext or files["plaintext"])
|
||||
and ((content_config.plaintext and has_valid_text_config(content_config.plaintext)) or files["plaintext"])
|
||||
and search_models.text_search
|
||||
and files["plaintext"]
|
||||
):
|
||||
|
|
|
@ -13,7 +13,10 @@ pytestmark = pytest.mark.skipif(
|
|||
import freezegun
|
||||
from freezegun import freeze_time
|
||||
|
||||
from gpt4all import GPT4All
|
||||
try:
|
||||
from gpt4all import GPT4All
|
||||
except ModuleNotFoundError as e:
|
||||
print("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.conversation.gpt4all.chat_model import converse_offline, extract_questions_offline, filter_questions
|
||||
|
|
Loading…
Reference in a new issue