Test text search index only updates on changes to text content

This commit is contained in:
Debanjum Singh Solanky 2023-05-12 17:37:34 +08:00
parent f9ccce430e
commit cc75f986b2
2 changed files with 23 additions and 0 deletions

View file

@ -73,6 +73,7 @@ def compute_embeddings(
# Encode any new entries in the corpus and update corpus embeddings
new_entries = [entry.compiled for id, entry in entries_with_ids if id == -1]
if new_entries:
logger.info(f"📩 Indexing {len(new_entries)} text entries.")
new_embeddings = bi_encoder.encode(
new_entries, convert_to_tensor=True, device=state.device, show_progress_bar=True
)
@ -87,6 +88,7 @@ def compute_embeddings(
# Else compute the corpus embeddings from scratch
else:
new_entries = [entry.compiled for _, entry in entries_with_ids]
logger.info(f"📩 Indexing {len(new_entries)} text entries. Creating index from scratch.")
corpus_embeddings = bi_encoder.encode(
new_entries, convert_to_tensor=True, device=state.device, show_progress_bar=True
)

View file

@ -1,4 +1,5 @@
# System Packages
import logging
from pathlib import Path
# External Packages
@ -48,6 +49,26 @@ def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchCo
assert len(notes_model.corpus_embeddings) == 10
# ----------------------------------------------------------------------------------------------------
def test_text_content_index_only_updates_on_changes(content_config: ContentConfig, search_config: SearchConfig, caplog):
    """Check the embeddings-saved log line appears only on the regenerating setup run.

    Text search setup is run twice over the same org content config: first
    with ``regenerate=True``, then with ``regenerate=False``. The INFO logs
    captured from the "khoj" logger are snapshotted separately for each run,
    and the saved-embeddings message must be present in the first snapshot
    and absent from the second.
    """
    # Arrange
    saved_embeddings_marker = "📩 Saved computed text embeddings to"
    caplog.set_level(logging.INFO, logger="khoj")

    # Act
    # First pass: set up text search with regeneration requested
    text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
    first_run_logs = caplog.text

    # Drop captured records so the next snapshot only holds the second pass's output
    caplog.clear()

    # Second pass: same data source, no regeneration requested
    text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
    second_run_logs = caplog.text

    # Assert
    assert saved_embeddings_marker in first_run_logs
    assert saved_embeddings_marker not in second_run_logs
# ----------------------------------------------------------------------------------------------------
def test_asymmetric_search(content_config: ContentConfig, search_config: SearchConfig):
# Arrange