khoj/tests/conftest.py

# External Packages
import os
from copy import deepcopy
from fastapi.testclient import TestClient
from pathlib import Path
import pytest

# Internal Packages
from khoj.main import app
from khoj.configure import configure_processor, configure_routes, configure_search_types
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.search_type import image_search, text_search
from khoj.utils.config import SearchModels
from khoj.utils.helpers import resolve_absolute_path
from khoj.utils.rawconfig import (
    ContentConfig,
    ConversationProcessorConfig,
    OpenAIProcessorConfig,
    ProcessorConfig,
    TextContentConfig,
    GithubContentConfig,
    GithubRepoConfig,
    ImageContentConfig,
    SearchConfig,
    TextSearchConfig,
    ImageSearchConfig,
)
from khoj.utils import state
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.search_filter.date_filter import DateFilter
from khoj.search_filter.word_filter import WordFilter
from khoj.search_filter.file_filter import FileFilter


@pytest.fixture(scope="session")
def search_config() -> SearchConfig:
    model_dir = resolve_absolute_path("~/.khoj/search")
    model_dir.mkdir(parents=True, exist_ok=True)
    search_config = SearchConfig()

    search_config.symmetric = TextSearchConfig(
        encoder="sentence-transformers/all-MiniLM-L6-v2",
        cross_encoder="cross-encoder/ms-marco-MiniLM-L-6-v2",
        model_directory=model_dir / "symmetric/",
        encoder_type=None,
    )

    search_config.asymmetric = TextSearchConfig(
        encoder="sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
        cross_encoder="cross-encoder/ms-marco-MiniLM-L-6-v2",
        model_directory=model_dir / "asymmetric/",
        encoder_type=None,
    )

    search_config.image = ImageSearchConfig(
        encoder="sentence-transformers/clip-ViT-B-32",
        model_directory=model_dir / "image/",
        encoder_type=None,
    )

    return search_config


@pytest.fixture(scope="session")
def search_models(search_config: SearchConfig):
    search_models = SearchModels()
    search_models.text_search = text_search.initialize_model(search_config.asymmetric)
    search_models.image_search = image_search.initialize_model(search_config.image)

    return search_models


@pytest.fixture(scope="session")
def content_config(tmp_path_factory, search_models: SearchModels, search_config: SearchConfig):
    content_dir = tmp_path_factory.mktemp("content")

    # Generate Image Embeddings from Test Images
    content_config = ContentConfig()
    content_config.image = ImageContentConfig(
        input_filter=None,
        input_directories=["tests/data/images"],
        embeddings_file=content_dir.joinpath("image_embeddings.pt"),
        batch_size=1,
        use_xmp_metadata=False,
    )

    image_search.setup(content_config.image, search_models.image_search.image_encoder, regenerate=False)

    # Generate Notes Embeddings from Test Notes
    content_config.org = TextContentConfig(
        input_files=None,
        input_filter=["tests/data/org/*.org"],
        compressed_jsonl=content_dir.joinpath("notes.jsonl.gz"),
        embeddings_file=content_dir.joinpath("note_embeddings.pt"),
    )

    filters = [DateFilter(), WordFilter(), FileFilter()]
    text_search.setup(
        OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False, filters=filters
    )

    content_config.plugins = {
        "plugin1": TextContentConfig(
            input_files=[content_dir.joinpath("notes.jsonl.gz")],
            input_filter=None,
            compressed_jsonl=content_dir.joinpath("plugin.jsonl.gz"),
            embeddings_file=content_dir.joinpath("plugin_embeddings.pt"),
        )
    }

    content_config.plaintext = TextContentConfig(
        input_files=None,
        input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"],
        compressed_jsonl=content_dir.joinpath("plaintext.jsonl.gz"),
        embeddings_file=content_dir.joinpath("plaintext_embeddings.pt"),
    )

    content_config.github = GithubContentConfig(
        pat_token=os.getenv("GITHUB_PAT_TOKEN", ""),
        repos=[
            GithubRepoConfig(
                owner="khoj-ai",
                name="lantern",
                branch="master",
            )
        ],
        compressed_jsonl=content_dir.joinpath("github.jsonl.gz"),
        embeddings_file=content_dir.joinpath("github_embeddings.pt"),
    )

    filters = [DateFilter(), WordFilter(), FileFilter()]
    text_search.setup(
        JsonlToJsonl,
        content_config.plugins["plugin1"],
        search_models.text_search.bi_encoder,
        regenerate=False,
        filters=filters,
    )

    return content_config


@pytest.fixture(scope="session")
def md_content_config(tmp_path_factory):
    content_dir = tmp_path_factory.mktemp("content")

    # Generate Embeddings for Markdown Content
    content_config = ContentConfig()
    content_config.markdown = TextContentConfig(
        input_files=None,
        input_filter=["tests/data/markdown/*.markdown"],
        compressed_jsonl=content_dir.joinpath("markdown.jsonl.gz"),
        embeddings_file=content_dir.joinpath("markdown_embeddings.pt"),
    )

    return content_config


@pytest.fixture(scope="session")
def processor_config(tmp_path_factory):
    openai_api_key = os.getenv("OPENAI_API_KEY")
    processor_dir = tmp_path_factory.mktemp("processor")

    # The conversation processor is the only configured processor
    # It needs an OpenAI API key to work.
    if not openai_api_key:
        return

    # Setup conversation processor, if OpenAI API key is set
    processor_config = ProcessorConfig()
    processor_config.conversation = ConversationProcessorConfig(
        openai=OpenAIProcessorConfig(api_key=openai_api_key),
        conversation_logfile=processor_dir.joinpath("conversation_logs.json"),
    )

    return processor_config


@pytest.fixture(scope="session")
def processor_config_offline_chat(tmp_path_factory):
    processor_dir = tmp_path_factory.mktemp("processor")

    # Setup conversation processor
    processor_config = ProcessorConfig()
    processor_config.conversation = ConversationProcessorConfig(
        enable_offline_chat=True,
        conversation_logfile=processor_dir.joinpath("conversation_logs.json"),
    )

    return processor_config


@pytest.fixture(scope="session")
def chat_client(md_content_config: ContentConfig, search_config: SearchConfig, processor_config: ProcessorConfig):
    # Initialize app state
    state.config.content_type = md_content_config
    state.config.search_type = search_config
    state.SearchType = configure_search_types(state.config)

    # Index Markdown Content for Search
    filters = [DateFilter(), WordFilter(), FileFilter()]
    state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
    state.content_index.markdown = text_search.setup(
        MarkdownToJsonl,
        md_content_config.markdown,
        state.search_models.text_search.bi_encoder,
        regenerate=False,
        filters=filters,
    )

    # Initialize Processor from Config
    state.processor_config = configure_processor(processor_config)

    configure_routes(app)
    return TestClient(app)


@pytest.fixture(scope="function")
def client(content_config: ContentConfig, search_config: SearchConfig, processor_config: ProcessorConfig):
    state.config.content_type = content_config
    state.config.search_type = search_config
    state.SearchType = configure_search_types(state.config)

    # These lines help us Mock the Search models for these search types
    state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
    state.search_models.image_search = image_search.initialize_model(search_config.image)
    state.content_index.org = text_search.setup(
        OrgToJsonl, content_config.org, state.search_models.text_search.bi_encoder, regenerate=False
    )
    state.content_index.image = image_search.setup(
        content_config.image, state.search_models.image_search, regenerate=False
    )

    state.processor_config = configure_processor(processor_config)

    configure_routes(app)
    return TestClient(app)


@pytest.fixture(scope="function")
def client_offline_chat(
    md_content_config: ContentConfig, search_config: SearchConfig, processor_config_offline_chat: ProcessorConfig
):
    # Initialize app state
    state.config.content_type = md_content_config
    state.config.search_type = search_config
    state.SearchType = configure_search_types(state.config)

    # Index Markdown Content for Search
    filters = [DateFilter(), WordFilter(), FileFilter()]
    state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
    state.content_index.markdown = text_search.setup(
        MarkdownToJsonl,
        md_content_config.markdown,
        state.search_models.text_search.bi_encoder,
        regenerate=False,
        filters=filters,
    )

    # Initialize Processor from Config
    state.processor_config = configure_processor(processor_config_offline_chat)

    configure_routes(app)
    return TestClient(app)


@pytest.fixture(scope="function")
def new_org_file(content_config: ContentConfig):
    # Setup
    new_org_file = Path(content_config.org.input_filter[0]).parent / "new_file.org"
    new_org_file.touch()

    yield new_org_file

    # Cleanup
    if new_org_file.exists():
        new_org_file.unlink()


@pytest.fixture(scope="function")
def org_config_with_only_new_file(content_config: ContentConfig, new_org_file: Path):
    new_org_config = deepcopy(content_config.org)
    new_org_config.input_files = [f"{new_org_file}"]
    new_org_config.input_filter = None
    return new_org_config