From 2b58218b5637c22c44bf8152eb4dc11b5aef038b Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 10 Sep 2022 14:15:43 +0300 Subject: [PATCH] Reuse search models across sessions. Merge unused pytest fixtures - Remove unused model_dir pytest fixture. It was only being used by the content_config fixture, not by any tests - Reuse existing search models downloaded to khoj directory. Downloading search models for each pytest session seems excessive and slows down tests quite a bit --- tests/conftest.py | 45 ++++++++++++++------------------------------- 1 file changed, 14 insertions(+), 31 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 7545527f..fdb26557 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ import pytest # Internal Packages from src.search_type import image_search, text_search from src.utils.config import SearchType +from src.utils.helpers import resolve_absolute_path from src.utils.rawconfig import ContentConfig, TextContentConfig, ImageContentConfig, SearchConfig, TextSearchConfig, ImageSearchConfig from src.processor.org_mode.org_to_jsonl import org_to_jsonl from src.search_filter.date_filter import DateFilter @@ -12,41 +13,41 @@ from src.search_filter.file_filter import FileFilter @pytest.fixture(scope='session') -def search_config(tmp_path_factory) -> SearchConfig: - model_dir = tmp_path_factory.mktemp('data') - +def search_config() -> SearchConfig: + model_dir = resolve_absolute_path('~/.khoj/search') + model_dir.mkdir(parents=True, exist_ok=True) search_config = SearchConfig() search_config.symmetric = TextSearchConfig( encoder = "sentence-transformers/all-MiniLM-L6-v2", cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2", - model_directory = model_dir + model_directory = model_dir / 'symmetric/' ) search_config.asymmetric = TextSearchConfig( encoder = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1", cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2", - model_directory = model_dir + 
model_directory = model_dir / 'asymmetric/' ) search_config.image = ImageSearchConfig( encoder = "sentence-transformers/clip-ViT-B-32", - model_directory = model_dir + model_directory = model_dir / 'image/' ) return search_config @pytest.fixture(scope='session') -def model_dir(search_config: SearchConfig): - model_dir = search_config.asymmetric.model_directory +def content_config(tmp_path_factory, search_config: SearchConfig): + content_dir = tmp_path_factory.mktemp('content') # Generate Image Embeddings from Test Images content_config = ContentConfig() content_config.image = ImageContentConfig( input_directories = ['tests/data/images'], - embeddings_file = model_dir.joinpath('image_embeddings.pt'), - batch_size = 10, + embeddings_file = content_dir.joinpath('image_embeddings.pt'), + batch_size = 1, use_xmp_metadata = False) image_search.setup(content_config.image, search_config.image, regenerate=False) @@ -55,28 +56,10 @@ def model_dir(search_config: SearchConfig): content_config.org = TextContentConfig( input_files = None, input_filter = 'tests/data/org/*.org', - compressed_jsonl = model_dir.joinpath('notes.jsonl.gz'), - embeddings_file = model_dir.joinpath('note_embeddings.pt')) + compressed_jsonl = content_dir.joinpath('notes.jsonl.gz'), + embeddings_file = content_dir.joinpath('note_embeddings.pt')) - filters = [DateFilter(), WordFilter(model_dir, search_type=SearchType.Org), FileFilter()] + filters = [DateFilter(), WordFilter(content_dir, search_type=SearchType.Org), FileFilter()] text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) - return model_dir - - -@pytest.fixture(scope='session') -def content_config(model_dir) -> ContentConfig: - content_config = ContentConfig() - content_config.org = TextContentConfig( - input_files = None, - input_filter = 'tests/data/org/*.org', - compressed_jsonl = model_dir.joinpath('notes.jsonl.gz'), - embeddings_file = model_dir.joinpath('note_embeddings.pt')) - - 
content_config.image = ImageContentConfig( - input_directories = ['tests/data/images'], - embeddings_file = model_dir.joinpath('image_embeddings.pt'), - batch_size = 1, - use_xmp_metadata = False) - return content_config \ No newline at end of file