From 88d1a29a849e491f5fcd3c2b3a993d56df6d6103 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Sat, 15 Jul 2023 17:18:07 -0700
Subject: [PATCH] Test index is stable for duplicate entries across regenerate, update

- Current incorrect behavior:
  All entries with a duplicate compiled form are kept on regenerate,
  but on update only the last of the duplicated entries is kept.

  This divergent behavior is not ideal: regenerate and update should
  produce the same index, to prevent index corruption across
  reconfigure and update.
---
 tests/test_text_search.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/tests/test_text_search.py b/tests/test_text_search.py
index 5809f327..3e8f7d3d 100644
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -161,6 +161,40 @@ def test_asymmetric_reload(content_config: ContentConfig, search_models: SearchM
     content_config.org.input_files = []
 
 
+# ----------------------------------------------------------------------------------------------------
+def test_update_index_with_duplicate_entries_in_stable_order(
+    org_config_with_only_new_file: TextContentConfig, search_models: SearchModels
+):
+    # Arrange
+    new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
+
+    # Insert org-mode entries with same compiled form into new org file
+    new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
+    with open(new_file_to_index, "w") as f:
+        f.write(f"{new_entry}{new_entry}")
+
+    # Act
+    # load embeddings, entries, notes model after adding new org-mode file
+    initial_index = text_search.setup(
+        OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=True
+    )
+
+    # update embeddings, entries, notes model after adding new org-mode file
+    updated_index = text_search.setup(
+        OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
+    )
+
+    # Assert
+    # verify only 1 entry added even if there are multiple duplicate entries
+    assert len(initial_index.entries) == len(updated_index.entries) == 1
+    assert len(initial_index.corpus_embeddings) == len(updated_index.corpus_embeddings) == 1
+
+    # verify the same entry is added even when there are multiple duplicate entries
+    error_details = compare_index(initial_index, updated_index)
+    if error_details:
+        pytest.fail(error_details)
+
+
 # ----------------------------------------------------------------------------------------------------
 def test_incremental_update(content_config: ContentConfig, search_models: SearchModels, new_org_file: Path):
     # Arrange