2022-09-05 00:05:13 +02:00
|
|
|
# External Packages
|
2023-03-15 21:26:19 +01:00
|
|
|
import os
|
2023-03-01 02:26:06 +01:00
|
|
|
from fastapi.testclient import TestClient
|
2023-01-09 20:17:36 +01:00
|
|
|
from pathlib import Path
|
2021-10-03 04:46:29 +02:00
|
|
|
import pytest
|
2023-10-15 04:39:13 +02:00
|
|
|
from fastapi.staticfiles import StaticFiles
|
2023-10-26 18:42:29 +02:00
|
|
|
from fastapi import FastAPI
|
|
|
|
import factory
|
|
|
|
import os
|
|
|
|
from fastapi import FastAPI
|
|
|
|
|
|
|
|
# Module-level FastAPI application; code in this module that uses `app` without
# defining a local one resolves to this instance.
app = FastAPI()
|
|
|
|
|
2021-10-03 04:46:29 +02:00
|
|
|
|
|
|
|
# Internal Packages
|
2023-10-15 04:39:13 +02:00
|
|
|
from khoj.configure import configure_processor, configure_routes, configure_search_types, configure_middleware
|
2023-08-31 21:55:17 +02:00
|
|
|
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
|
2023-02-14 21:50:51 +01:00
|
|
|
from khoj.search_type import image_search, text_search
|
2023-07-22 09:28:14 +02:00
|
|
|
from khoj.utils.config import SearchModels
|
2023-10-15 04:39:13 +02:00
|
|
|
from khoj.utils.constants import web_directory
|
2023-02-14 21:50:51 +01:00
|
|
|
from khoj.utils.helpers import resolve_absolute_path
|
2023-02-17 17:04:26 +01:00
|
|
|
from khoj.utils.rawconfig import (
|
|
|
|
ContentConfig,
|
2023-03-15 21:26:19 +01:00
|
|
|
ConversationProcessorConfig,
|
2023-10-15 23:19:29 +02:00
|
|
|
OfflineChatProcessorConfig,
|
2023-07-27 20:27:32 +02:00
|
|
|
OpenAIProcessorConfig,
|
2023-03-15 21:26:19 +01:00
|
|
|
ProcessorConfig,
|
2023-02-17 17:04:26 +01:00
|
|
|
TextContentConfig,
|
|
|
|
ImageContentConfig,
|
|
|
|
SearchConfig,
|
|
|
|
TextSearchConfig,
|
|
|
|
ImageSearchConfig,
|
|
|
|
)
|
2023-09-18 23:41:26 +02:00
|
|
|
from khoj.utils import state, fs_syncer
|
|
|
|
from khoj.routers.indexer import configure_content
|
2023-02-14 21:50:51 +01:00
|
|
|
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
2023-10-26 18:42:29 +02:00
|
|
|
from database.models import (
|
|
|
|
LocalOrgConfig,
|
|
|
|
LocalMarkdownConfig,
|
|
|
|
LocalPlaintextConfig,
|
|
|
|
LocalPdfConfig,
|
|
|
|
GithubConfig,
|
|
|
|
KhojUser,
|
|
|
|
GithubRepoConfig,
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def enable_db_access_for_all_tests(db):
    """Request the ``db`` fixture on behalf of every test.

    The fixture is ``autouse`` and has no body of its own: its only effect is
    that ``db`` is pulled in for each test without the test asking for it.
    """
|
|
|
|
|
|
|
|
|
|
|
|
class UserFactory(factory.django.DjangoModelFactory):
    """factory_boy factory bound to the ``KhojUser`` model.

    Every declared field is populated from a Faker provider, so each built
    user gets generated values rather than fixed ones.
    """

    class Meta:
        # Model class this factory instantiates.
        model = KhojUser

    # Field declarations; the string names the Faker provider used for the value.
    username = factory.Faker("name")
    email = factory.Faker("email")
    password = factory.Faker("password")
    uuid = factory.Faker("uuid4")
|
2021-10-03 04:46:29 +02:00
|
|
|
|
|
|
|
|
2023-02-17 17:04:26 +01:00
|
|
|
@pytest.fixture(scope="session")
def search_config() -> SearchConfig:
    """Build the session-wide ``SearchConfig`` used by the tests.

    Ensures the ``~/.khoj/search`` directory (as resolved by
    ``resolve_absolute_path``) exists, then fills in the symmetric, asymmetric
    and image search sections, each with its own sub-directory for models.

    Returns:
        The populated ``SearchConfig``.
    """
    models_root = resolve_absolute_path("~/.khoj/search")
    models_root.mkdir(parents=True, exist_ok=True)

    symmetric_search = TextSearchConfig(
        encoder="sentence-transformers/all-MiniLM-L6-v2",
        cross_encoder="cross-encoder/ms-marco-MiniLM-L-6-v2",
        model_directory=models_root / "symmetric/",
        encoder_type=None,
    )
    asymmetric_search = TextSearchConfig(
        encoder="sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
        cross_encoder="cross-encoder/ms-marco-MiniLM-L-6-v2",
        model_directory=models_root / "asymmetric/",
        encoder_type=None,
    )
    image_config = ImageSearchConfig(
        encoder="sentence-transformers/clip-ViT-B-32",
        model_directory=models_root / "image/",
        encoder_type=None,
    )

    config = SearchConfig()
    config.symmetric = symmetric_search
    config.asymmetric = asymmetric_search
    config.image = image_config
    return config
|
|
|
|
|
|
|
|
|
2023-10-26 18:42:29 +02:00
|
|
|
@pytest.fixture
def default_user():
    """Return a ``KhojUser`` produced by ``UserFactory``.

    Function-scoped (the pytest default), so each test that requests it gets
    its own factory-built user.
    """
    # No ``pytest.mark.django_db`` here: pytest marks have no effect on
    # fixtures, and recent pytest versions warn about (and will reject) them.
    # Database access comes from the autouse ``enable_db_access_for_all_tests``
    # fixture in this module, which requests ``db``.
    return UserFactory()
|
|
|
|
|
|
|
|
|
2023-02-17 17:04:26 +01:00
|
|
|
@pytest.fixture(scope="session")
def search_models(search_config: SearchConfig):
    """Create the session-wide ``SearchModels`` container.

    Only the image search model is initialized, from ``search_config.image``.

    Returns:
        A ``SearchModels`` instance with ``image_search`` set.
    """
    models = SearchModels()
    models.image_search = image_search.initialize_model(search_config.image)
    return models
|
|
|
|
|
|
|
|
|
2023-10-26 18:42:29 +02:00
|
|
|
@pytest.fixture
def anyio_backend():
    """Name the async backend for anyio-driven tests: always ``"asyncio"``."""
    backend_name = "asyncio"
    return backend_name
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="function")
def content_config(tmp_path_factory, search_models: SearchModels, default_user: KhojUser):
    """Set up content configuration and database rows for ``default_user``.

    Steps, in order:
      * build a ``ContentConfig`` whose image section reads ``tests/data/images``
        and writes its embeddings file into a fresh temporary directory,
      * run ``image_search.setup`` and the org-mode ``text_search.setup`` on the
        sample org data,
      * create ``LocalOrgConfig`` and ``LocalPlaintextConfig`` rows for the user,
      * when the ``GITHUB_PAT_TOKEN`` environment variable is set, also create a
        ``GithubConfig`` plus a ``GithubRepoConfig`` for ``khoj-ai/lantern``
        (branch ``master``).

    Args:
        tmp_path_factory: pytest's temporary directory factory.
        search_models: Session search models; its image encoder is passed to
            ``image_search.setup``.
        default_user: Owner of every database row created here.

    Returns:
        The ``ContentConfig`` with its ``image`` section populated.
    """
    # No ``pytest.mark.django_db`` on this fixture: pytest marks have no effect
    # on fixtures, and recent pytest versions warn about (and will reject) them.
    # Database access comes from the autouse ``enable_db_access_for_all_tests``
    # fixture in this module.
    content_dir = tmp_path_factory.mktemp("content")

    # Generate Image Embeddings from Test Images
    content_config = ContentConfig()
    content_config.image = ImageContentConfig(
        input_filter=None,
        input_directories=["tests/data/images"],
        embeddings_file=content_dir.joinpath("image_embeddings.pt"),
        batch_size=1,
        use_xmp_metadata=False,
    )

    image_search.setup(content_config.image, search_models.image_search.image_encoder, regenerate=False)

    LocalOrgConfig.objects.create(
        input_files=None,
        input_filter=["tests/data/org/*.org"],
        index_heading_entries=False,
        user=default_user,
    )

    text_search.setup(OrgToJsonl, get_sample_data("org"), regenerate=False, user=default_user)

    # Read the token once so the check and the stored value cannot diverge.
    github_pat_token = os.getenv("GITHUB_PAT_TOKEN")
    if github_pat_token:
        # Keep the created row and hand it straight to the repo config instead
        # of fetching it back from the database by user.
        github_config = GithubConfig.objects.create(
            pat_token=github_pat_token,
            user=default_user,
        )

        GithubRepoConfig.objects.create(
            owner="khoj-ai",
            name="lantern",
            branch="master",
            github_config=github_config,
        )

    LocalPlaintextConfig.objects.create(
        input_files=None,
        input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"],
        user=default_user,
    )

    return content_config
|
2023-01-09 20:17:36 +01:00
|
|
|
|
|
|
|
|
2023-03-01 02:26:06 +01:00
|
|
|
@pytest.fixture(scope="session")
def md_content_config():
    """Create and return a ``LocalMarkdownConfig`` row for the sample markdown files.

    The row has no explicit input files and a single input filter,
    ``tests/data/markdown/*.markdown``.
    """
    # NOTE(review): this fixture is session-scoped yet writes to the database,
    # while the only DB-enabling fixture visible in this module is
    # function-scoped -- confirm database access is available when it runs.
    # NOTE(review): unlike the other Local*Config rows created in this module,
    # no ``user`` is passed here -- confirm that is intended.
    return LocalMarkdownConfig.objects.create(
        input_files=None,
        input_filter=["tests/data/markdown/*.markdown"],
    )
|
2023-03-15 21:26:19 +01:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="session")
def processor_config(tmp_path_factory):
    """Build a ``ProcessorConfig`` with an OpenAI-backed conversation processor.

    Args:
        tmp_path_factory: pytest's temporary directory factory; a ``processor``
            directory is created from it to hold the conversation log file.

    Returns:
        The populated ``ProcessorConfig``, or ``None`` when the
        ``OPENAI_API_KEY`` environment variable is unset or empty.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    logs_dir = tmp_path_factory.mktemp("processor")

    # The conversation processor is the only configured processor
    # It needs an OpenAI API key to work.
    if not api_key:
        return None

    # Setup conversation processor, if OpenAI API key is set
    conversation = ConversationProcessorConfig(
        openai=OpenAIProcessorConfig(api_key=api_key),
        conversation_logfile=logs_dir.joinpath("conversation_logs.json"),
    )
    config = ProcessorConfig()
    config.conversation = conversation
    return config
|
|
|
|
|
|
|
|
|
2023-08-01 05:24:52 +02:00
|
|
|
@pytest.fixture(scope="session")
def processor_config_offline_chat(tmp_path_factory):
    """Build a ``ProcessorConfig`` whose conversation processor has offline chat enabled.

    Args:
        tmp_path_factory: pytest's temporary directory factory; a ``processor``
            directory is created from it to hold the conversation log file.

    Returns:
        The populated ``ProcessorConfig``.
    """
    logs_dir = tmp_path_factory.mktemp("processor")

    # Setup conversation processor
    config = ProcessorConfig()
    config.conversation = ConversationProcessorConfig(
        offline_chat=OfflineChatProcessorConfig(enable_offline_chat=True),
        conversation_logfile=logs_dir.joinpath("conversation_logs.json"),
    )
    return config
|
|
|
|
|
|
|
|
|
2023-03-15 21:26:19 +01:00
|
|
|
@pytest.fixture(scope="session")
def chat_client(md_content_config: ContentConfig, search_config: SearchConfig, processor_config: ProcessorConfig):
    """Session-wide ``TestClient`` for chat tests, backed by a freshly built app.

    Populates the shared ``state`` module (search types, content index,
    processor config, anonymous mode) and then assembles a new ``FastAPI``
    application with routes, middleware and the static files mount.

    Args:
        md_content_config: Requested so the markdown config fixture runs first;
            its value is not read in this body.
        search_config: Stored on ``state.config.search_type``.
        processor_config: Passed to ``configure_processor``.

    Returns:
        A ``TestClient`` wrapping the new application.
    """
    # Initialize app state
    state.config.search_type = search_config
    state.SearchType = configure_search_types(state.config)

    # Index Markdown Content for Search
    # NOTE(review): ``collect_files`` is called without arguments here but with
    # ``state.config.content_type`` in ``client_offline_chat`` -- confirm which
    # call form is intended.
    files_to_index = fs_syncer.collect_files()
    state.content_index = configure_content(
        state.content_index, state.config.content_type, files_to_index, state.search_models
    )

    # Initialize Processor from Config
    state.processor_config = configure_processor(processor_config)
    state.anonymous_mode = True

    test_app = FastAPI()
    configure_routes(test_app)
    configure_middleware(test_app)
    test_app.mount("/static", StaticFiles(directory=web_directory), name="static")
    return TestClient(test_app)
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="function")
def fastapi_app():
    """Assemble a new ``FastAPI`` application for a single test.

    Routes and middleware are configured via the khoj ``configure_*`` helpers
    and ``web_directory`` is mounted at ``/static``.

    Returns:
        The configured application (not wrapped in a test client).
    """
    application = FastAPI()
    configure_routes(application)
    configure_middleware(application)
    static_files = StaticFiles(directory=web_directory)
    application.mount("/static", static_files, name="static")
    return application
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="function")
def client(
    content_config: ContentConfig,
    search_config: SearchConfig,
    processor_config: ProcessorConfig,
    default_user: KhojUser,
):
    """Per-test ``TestClient`` with org, plaintext and image content set up.

    Populates the shared ``state`` module (content and search config, search
    types, image search model, image content index, processor config,
    anonymous mode), runs text search setup for the sample org and plaintext
    data on behalf of ``default_user``, and returns a client for a newly built
    application.

    Args:
        content_config: Stored on ``state.config.content_type``; its ``image``
            section feeds ``image_search.setup``.
        search_config: Stored on ``state.config.search_type``; its ``image``
            section is used to initialize the image search model.
        processor_config: Passed to ``configure_processor``.
        default_user: User the sample text content is set up for.

    Returns:
        A ``TestClient`` wrapping a fresh ``FastAPI`` application.
    """
    state.config.content_type = content_config
    state.config.search_type = search_config
    state.SearchType = configure_search_types(state.config)

    # These lines help us Mock the Search models for these search types
    state.search_models.image_search = image_search.initialize_model(search_config.image)
    text_search.setup(
        OrgToJsonl,
        get_sample_data("org"),
        regenerate=False,
        user=default_user,
    )
    state.content_index.image = image_search.setup(
        content_config.image, state.search_models.image_search, regenerate=False
    )
    text_search.setup(
        PlaintextToJsonl,
        get_sample_data("plaintext"),
        regenerate=False,
        user=default_user,
    )

    state.processor_config = configure_processor(processor_config)
    state.anonymous_mode = True

    # Build a fresh application for every test rather than reconfiguring the
    # module-level ``app``: this fixture is function-scoped, so doing the
    # route, middleware and mount registration on one shared instance would
    # repeat it on that instance for each test. Matches ``chat_client`` and
    # ``fastapi_app``, which also create their own application.
    app = FastAPI()
    configure_routes(app)
    configure_middleware(app)
    app.mount("/static", StaticFiles(directory=web_directory), name="static")
    return TestClient(app)
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="function")
def client_offline_chat(
    search_config: SearchConfig,
    processor_config_offline_chat: ProcessorConfig,
    content_config: ContentConfig,
    md_content_config,
):
    """Per-test ``TestClient`` configured with the offline chat processor.

    Populates the shared ``state`` module (content and search config, search
    types, image search model, content index, processor config, anonymous
    mode) and returns a client for a newly built application.

    Args:
        search_config: Stored on ``state.config.search_type``; its ``image``
            section is used to initialize the image search model.
        processor_config_offline_chat: Passed to ``configure_processor``.
        content_config: Requested so that fixture's setup runs; its value is
            not read in this body.
        md_content_config: Stored on ``state.config.content_type``.

    Returns:
        A ``TestClient`` wrapping a fresh ``FastAPI`` application.
    """
    # Initialize app state
    state.config.content_type = md_content_config
    state.config.search_type = search_config
    state.SearchType = configure_search_types(state.config)

    # Index Markdown Content for Search
    state.search_models.image_search = image_search.initialize_model(search_config.image)

    all_files = fs_syncer.collect_files(state.config.content_type)
    state.content_index = configure_content(
        state.content_index, state.config.content_type, all_files, state.search_models
    )

    # Initialize Processor from Config
    state.processor_config = configure_processor(processor_config_offline_chat)
    state.anonymous_mode = True

    # Build a fresh application for every test rather than reconfiguring the
    # module-level ``app``: this fixture is function-scoped, so doing the
    # route, middleware and mount registration on one shared instance would
    # repeat it on that instance for each test. Matches ``chat_client`` and
    # ``fastapi_app``, which also create their own application.
    app = FastAPI()
    configure_routes(app)
    configure_middleware(app)
    app.mount("/static", StaticFiles(directory=web_directory), name="static")
    return TestClient(app)
|
|
|
|
|
|
|
|
|
2023-02-17 17:04:26 +01:00
|
|
|
@pytest.fixture(scope="function")
def new_org_file(default_user: KhojUser, content_config: ContentConfig):
    """Yield the path of an empty ``new_file.org`` and remove it afterwards.

    The file is placed in the parent directory of the first input filter of
    the default user's first ``LocalOrgConfig`` row.

    Args:
        default_user: User whose org config is looked up.
        content_config: Requested so that fixture's setup runs first; its
            value is not read in this body.

    Yields:
        ``Path`` of the newly touched org file.
    """
    # Setup
    user_org_config = LocalOrgConfig.objects.filter(user=default_user).first()
    first_filter = user_org_config.input_filter[0]
    org_file_path = Path(first_filter).parent.joinpath("new_file.org")
    org_file_path.touch()

    yield org_file_path

    # Cleanup
    if not org_file_path.exists():
        return
    org_file_path.unlink()
|
|
|
|
|
|
|
|
|
2023-02-17 17:04:26 +01:00
|
|
|
@pytest.fixture(scope="function")
def org_config_with_only_new_file(new_org_file: Path, default_user: KhojUser):
    """Point the default user's org config at the new org file only.

    Args:
        new_org_file: Path of the file that becomes the sole input file.
        default_user: User whose ``LocalOrgConfig`` rows are rewritten.

    Returns:
        The default user's first ``LocalOrgConfig`` row after the update, or
        ``None`` if the user has no such row.
    """
    # Restrict the bulk update to the default user's rows. An unfiltered
    # ``objects.update(...)`` rewrites every user's org config, although only
    # the default user's row is returned from this fixture.
    user_org_configs = LocalOrgConfig.objects.filter(user=default_user)
    user_org_configs.update(input_files=[str(new_org_file)], input_filter=None)
    return user_org_configs.first()
|
2023-08-31 21:55:17 +02:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="function")
def sample_org_data():
    """Provide the sample org-mode documents (file name -> content) to a test."""
    org_samples = get_sample_data("org")
    return org_samples
|
|
|
|
|
|
|
|
|
|
|
|
def get_sample_data(type):
    """Return the in-memory sample documents for one content type.

    Args:
        type: One of ``"org"``, ``"markdown"`` or ``"plaintext"``.

    Returns:
        A dict mapping a sample file name to that file's text content.

    Raises:
        KeyError: If ``type`` is not one of the supported content types.
    """
    org_readme = (
        "\n"
        "* Khoj\n"
        "/Allow natural language search on user content like notes, images using transformer based models/\n"
        "\n"
        "All data is processed locally. User can interface with khoj app via"
        " [[./interface/emacs/khoj.el][Emacs]], API or Commandline\n"
        "\n"
        "** Dependencies\n"
        "- Python3\n"
        "- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]\n"
        "\n"
        "** Install\n"
        "#+begin_src shell\n"
        "git clone https://github.com/khoj-ai/khoj && cd khoj\n"
        "conda env create -f environment.yml\n"
        "conda activate khoj\n"
        "#+end_src"
    )

    markdown_readme = (
        "\n"
        "# Khoj\n"
        "Allow natural language search on user content like notes, images using transformer based models\n"
        "\n"
        "All data is processed locally. User can interface with khoj app via"
        " [Emacs](./interface/emacs/khoj.el), API or Commandline\n"
        "\n"
        "## Dependencies\n"
        "- Python3\n"
        "- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)\n"
        "\n"
        "## Install\n"
        "```shell\n"
        "git clone\n"
        "conda env create -f environment.yml\n"
        "conda activate khoj\n"
        "```\n"
    )

    plaintext_readme = (
        "\n"
        "Khoj\n"
        "Allow natural language search on user content like notes, images using transformer based models\n"
        "\n"
        "All data is processed locally. User can interface with khoj app via Emacs, API or Commandline\n"
        "\n"
        "Dependencies\n"
        "- Python3\n"
        "- Miniconda\n"
        "\n"
        "Install\n"
        "git clone\n"
        "conda env create -f environment.yml\n"
        "conda activate khoj\n"
    )

    # Content type -> {sample file name: file content}.
    samples_by_type = {
        "org": {"readme.org": org_readme},
        "markdown": {"readme.markdown": markdown_readme},
        "plaintext": {"readme.txt": plaintext_readme},
    }

    return samples_by_type[type]
|