mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
b45e1d8c0d
* Store conversation command options in an Enum * Move to slash commands instead of using @ to specify general commands * Calculate conversation command once & pass it as arg to child funcs * Add /notes command to respond using only knowledge base as context. This prevents the chat model from trying to respond using only its general world knowledge, without any references pulled from the indexed knowledge base * Test general and notes slash commands in openai chat director tests --------- Co-authored-by: Debanjum Singh Solanky <debanjum@gmail.com>
286 lines
9.7 KiB
Python
286 lines
9.7 KiB
Python
# Standard Library & External Packages
|
|
import os
|
|
from copy import deepcopy
|
|
from fastapi.testclient import TestClient
|
|
from pathlib import Path
|
|
import pytest
|
|
|
|
# Internal Packages
|
|
from khoj.main import app
|
|
from khoj.configure import configure_processor, configure_routes, configure_search_types
|
|
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
|
from khoj.search_type import image_search, text_search
|
|
from khoj.utils.config import SearchModels
|
|
from khoj.utils.helpers import resolve_absolute_path
|
|
from khoj.utils.rawconfig import (
|
|
ContentConfig,
|
|
ConversationProcessorConfig,
|
|
OpenAIProcessorConfig,
|
|
ProcessorConfig,
|
|
TextContentConfig,
|
|
GithubContentConfig,
|
|
GithubRepoConfig,
|
|
ImageContentConfig,
|
|
SearchConfig,
|
|
TextSearchConfig,
|
|
ImageSearchConfig,
|
|
)
|
|
from khoj.utils import state
|
|
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
|
|
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
|
from khoj.search_filter.date_filter import DateFilter
|
|
from khoj.search_filter.word_filter import WordFilter
|
|
from khoj.search_filter.file_filter import FileFilter
|
|
|
|
|
|
@pytest.fixture(scope="session")
def search_config() -> SearchConfig:
    """Build the search model configuration shared across the test session.

    Creates the model directory resolved from ``~/.khoj/search`` if it is
    missing, then configures the symmetric and asymmetric text search models
    and the image search model in sub-directories beneath it.

    Returns:
        SearchConfig with ``symmetric``, ``asymmetric`` and ``image`` set.
    """
    models_root = resolve_absolute_path("~/.khoj/search")
    models_root.mkdir(parents=True, exist_ok=True)

    # Both text search configurations use the same cross-encoder
    cross_encoder_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"

    config = SearchConfig()

    symmetric_config = TextSearchConfig(
        encoder="sentence-transformers/all-MiniLM-L6-v2",
        cross_encoder=cross_encoder_name,
        model_directory=models_root / "symmetric/",
        encoder_type=None,
    )
    config.symmetric = symmetric_config

    asymmetric_config = TextSearchConfig(
        encoder="sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
        cross_encoder=cross_encoder_name,
        model_directory=models_root / "asymmetric/",
        encoder_type=None,
    )
    config.asymmetric = asymmetric_config

    image_config = ImageSearchConfig(
        encoder="sentence-transformers/clip-ViT-B-32",
        model_directory=models_root / "image/",
        encoder_type=None,
    )
    config.image = image_config

    return config
|
|
|
|
|
|
@pytest.fixture(scope="session")
def search_models(search_config: SearchConfig):
    """Initialize the text and image search models once per test session.

    Args:
        search_config: Session search configuration. Its ``asymmetric`` entry
            configures the text model and its ``image`` entry the image model.

    Returns:
        SearchModels with ``text_search`` and ``image_search`` populated.
    """
    models = SearchModels()

    text_model = text_search.initialize_model(search_config.asymmetric)
    models.text_search = text_model

    image_model = image_search.initialize_model(search_config.image)
    models.image_search = image_model

    return models
|
|
|
|
|
|
@pytest.fixture(scope="session")
def content_config(tmp_path_factory, search_models: SearchModels, search_config: SearchConfig):
    """Build the content configuration used by the search tests.

    Configures image, org, plugin, plaintext and github content with all
    generated artifacts placed in a fresh temporary ``content`` directory.
    Search setup is run here for the image, org and plugin content only;
    plaintext and github content are configured but not set up.

    Args:
        tmp_path_factory: pytest factory used to create the temporary directory.
        search_models: Session search models providing the encoders for setup.
        search_config: Requested as a fixture dependency; not read in this body.

    Returns:
        The populated ContentConfig.
    """

    def build_filters():
        # Each setup call gets its own, newly created filter instances
        return [DateFilter(), WordFilter(), FileFilter()]

    workdir = tmp_path_factory.mktemp("content")
    config = ContentConfig()

    # Generate Image Embeddings from Test Images
    config.image = ImageContentConfig(
        input_filter=None,
        input_directories=["tests/data/images"],
        embeddings_file=workdir / "image_embeddings.pt",
        batch_size=1,
        use_xmp_metadata=False,
    )
    image_search.setup(config.image, search_models.image_search.image_encoder, regenerate=False)

    # Generate Notes Embeddings from Test Notes
    org_jsonl_path = workdir / "notes.jsonl.gz"
    config.org = TextContentConfig(
        input_files=None,
        input_filter=["tests/data/org/*.org"],
        compressed_jsonl=org_jsonl_path,
        embeddings_file=workdir / "note_embeddings.pt",
    )
    text_search.setup(
        OrgToJsonl,
        config.org,
        search_models.text_search.bi_encoder,
        regenerate=False,
        filters=build_filters(),
    )

    # The plugin takes the org notes JSONL path configured above as its input file
    plugin_config = TextContentConfig(
        input_files=[workdir / "notes.jsonl.gz"],
        input_filter=None,
        compressed_jsonl=workdir / "plugin.jsonl.gz",
        embeddings_file=workdir / "plugin_embeddings.pt",
    )
    config.plugins = {"plugin1": plugin_config}

    plaintext_globs = ["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"]
    config.plaintext = TextContentConfig(
        input_files=None,
        input_filter=plaintext_globs,
        compressed_jsonl=workdir / "plaintext.jsonl.gz",
        embeddings_file=workdir / "plaintext_embeddings.pt",
    )

    # Token is read from the environment; falls back to an empty string when unset
    lantern_repo = GithubRepoConfig(
        owner="khoj-ai",
        name="lantern",
        branch="master",
    )
    config.github = GithubContentConfig(
        pat_token=os.getenv("GITHUB_PAT_TOKEN", ""),
        repos=[lantern_repo],
        compressed_jsonl=workdir / "github.jsonl.gz",
        embeddings_file=workdir / "github_embeddings.pt",
    )

    text_search.setup(
        JsonlToJsonl,
        config.plugins["plugin1"],
        search_models.text_search.bi_encoder,
        regenerate=False,
        filters=build_filters(),
    )

    return config
|
|
|
|
|
|
@pytest.fixture(scope="session")
def md_content_config(tmp_path_factory):
    """Build a content configuration containing only markdown content.

    Args:
        tmp_path_factory: pytest factory used to create a temporary ``content``
            directory that holds the configured JSONL and embeddings paths.

    Returns:
        ContentConfig with only ``markdown`` configured. No search setup is
        run by this fixture.
    """
    workdir = tmp_path_factory.mktemp("content")

    # Markdown test notes are matched by glob rather than listed explicitly
    markdown_config = TextContentConfig(
        input_files=None,
        input_filter=["tests/data/markdown/*.markdown"],
        compressed_jsonl=workdir / "markdown.jsonl.gz",
        embeddings_file=workdir / "markdown_embeddings.pt",
    )

    config = ContentConfig()
    config.markdown = markdown_config
    return config
|
|
|
|
|
|
@pytest.fixture(scope="session")
def processor_config(tmp_path_factory):
    """Build the OpenAI-backed conversation processor configuration.

    The conversation processor is the only processor configured here and it
    is only configured when the ``OPENAI_API_KEY`` environment variable is set.

    Args:
        tmp_path_factory: pytest factory used to create the temporary
            ``processor`` directory that holds the conversation log file.

    Returns:
        ProcessorConfig with ``conversation`` set, or None when no OpenAI API
        key is available in the environment.
    """
    log_dir = tmp_path_factory.mktemp("processor")
    api_key = os.getenv("OPENAI_API_KEY")

    if api_key:
        # Setup conversation processor, as an OpenAI API key is set
        config = ProcessorConfig()
        config.conversation = ConversationProcessorConfig(
            openai=OpenAIProcessorConfig(api_key=api_key),
            conversation_logfile=log_dir / "conversation_logs.json",
        )
        return config

    # Without an API key there is nothing to configure
    return None
|
|
|
|
|
|
@pytest.fixture(scope="session")
def processor_config_offline_chat(tmp_path_factory):
    """Build a conversation processor configuration with offline chat enabled.

    Args:
        tmp_path_factory: pytest factory used to create the temporary
            ``processor`` directory that holds the conversation log file.

    Returns:
        ProcessorConfig whose ``conversation`` has ``enable_offline_chat=True``.
    """
    log_dir = tmp_path_factory.mktemp("processor")

    # Setup conversation processor
    conversation_config = ConversationProcessorConfig(
        enable_offline_chat=True,
        conversation_logfile=log_dir / "conversation_logs.json",
    )

    config = ProcessorConfig()
    config.conversation = conversation_config
    return config
|
|
|
|
|
|
@pytest.fixture(scope="session")
def chat_client(md_content_config: ContentConfig, search_config: SearchConfig, processor_config: ProcessorConfig):
    """Create a session-wide test client for chat tests over markdown content.

    Mutates the shared ``state`` module: sets the content and search
    configuration, initializes the text search model, sets up the markdown
    content index and configures the processor before routes are registered.

    Args:
        md_content_config: Content configuration with markdown content.
        search_config: Search model configuration.
        processor_config: Conversation processor configuration. The fixture
            producing it yields None when no OpenAI API key is set.

    Returns:
        TestClient wrapping the application.
    """
    # Initialize app state
    state.config.content_type = md_content_config
    state.config.search_type = search_config
    state.SearchType = configure_search_types(state.config)

    # Index Markdown Content for Search
    markdown_filters = [DateFilter(), WordFilter(), FileFilter()]
    state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
    bi_encoder = state.search_models.text_search.bi_encoder
    markdown_index = text_search.setup(
        MarkdownToJsonl,
        md_content_config.markdown,
        bi_encoder,
        regenerate=False,
        filters=markdown_filters,
    )
    state.content_index.markdown = markdown_index

    # Initialize Processor from Config
    state.processor_config = configure_processor(processor_config)

    configure_routes(app)
    test_client = TestClient(app)
    return test_client
|
|
|
|
|
|
@pytest.fixture(scope="function")
def client(content_config: ContentConfig, search_config: SearchConfig, processor_config: ProcessorConfig):
    """Create a per-test client with org and image content indexed for search.

    Mutates the shared ``state`` module: sets the content and search
    configuration, initializes the text and image search models, sets up the
    org and image content indexes and configures the processor before routes
    are registered.

    Args:
        content_config: Content configuration providing the org and image content.
        search_config: Search model configuration.
        processor_config: Conversation processor configuration. The fixture
            producing it yields None when no OpenAI API key is set.

    Returns:
        TestClient wrapping the application.
    """
    state.config.content_type = content_config
    state.config.search_type = search_config
    state.SearchType = configure_search_types(state.config)

    # These lines help us Mock the Search models for these search types
    state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
    state.search_models.image_search = image_search.initialize_model(search_config.image)
    state.content_index.org = text_search.setup(
        OrgToJsonl, content_config.org, state.search_models.text_search.bi_encoder, regenerate=False
    )
    # Pass the image encoder itself rather than the whole image search model,
    # matching how the content_config fixture calls image_search.setup
    state.content_index.image = image_search.setup(
        content_config.image, state.search_models.image_search.image_encoder, regenerate=False
    )

    state.processor_config = configure_processor(processor_config)

    configure_routes(app)
    return TestClient(app)
|
|
|
|
|
|
@pytest.fixture(scope="function")
def client_offline_chat(
    md_content_config: ContentConfig, search_config: SearchConfig, processor_config_offline_chat: ProcessorConfig
):
    """Create a per-test client for offline chat tests over markdown content.

    Mutates the shared ``state`` module: sets the content and search
    configuration, initializes the text search model, sets up the markdown
    content index and configures the offline chat processor before routes are
    registered.

    Args:
        md_content_config: Content configuration with markdown content.
        search_config: Search model configuration.
        processor_config_offline_chat: Processor configuration with offline
            chat enabled.

    Returns:
        TestClient wrapping the application.
    """
    # Initialize app state
    state.config.content_type = md_content_config
    state.config.search_type = search_config
    state.SearchType = configure_search_types(state.config)

    # Index Markdown Content for Search
    markdown_filters = [DateFilter(), WordFilter(), FileFilter()]
    state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
    bi_encoder = state.search_models.text_search.bi_encoder
    markdown_index = text_search.setup(
        MarkdownToJsonl,
        md_content_config.markdown,
        bi_encoder,
        regenerate=False,
        filters=markdown_filters,
    )
    state.content_index.markdown = markdown_index

    # Initialize Processor from Config
    state.processor_config = configure_processor(processor_config_offline_chat)

    configure_routes(app)
    test_client = TestClient(app)
    return test_client
|
|
|
|
|
|
@pytest.fixture(scope="function")
def new_org_file(content_config: ContentConfig):
    """Provide an empty ``new_file.org`` for a test and remove it afterwards.

    The file is created in the parent directory of the first org input filter
    pattern in ``content_config``.

    Args:
        content_config: Content configuration whose org input filter locates
            the directory for the new file.

    Yields:
        Path to the newly created org file.
    """
    # Setup
    org_dir = Path(content_config.org.input_filter[0]).parent
    created_file = org_dir / "new_file.org"
    created_file.touch()

    yield created_file

    # Cleanup: the test may already have removed the file itself
    if not created_file.exists():
        return
    created_file.unlink()
|
|
|
|
|
|
@pytest.fixture(scope="function")
def org_config_with_only_new_file(content_config: ContentConfig, new_org_file: Path):
    """Return a copy of the org content config that targets only the new org file.

    The session-wide ``content_config.org`` is deep-copied, so it is left
    unmodified.

    Args:
        content_config: Content configuration providing the org config to copy.
        new_org_file: Path of the org file to use as the sole input file.

    Returns:
        The copied org config with ``input_files`` set to the new file's path
        as a string and ``input_filter`` cleared.
    """
    only_new_file_config = deepcopy(content_config.org)
    only_new_file_config.input_filter = None
    only_new_file_config.input_files = [str(new_org_file)]
    return only_new_file_config
|