From 24676f95d8d9bf1bd5526aa44f60827ee74dc6fe Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Sun, 25 Dec 2022 21:45:40 -0300
Subject: [PATCH] Fix comments, use minimal test case, regenerate test index,
 merge debug logs

- Remove property drawer from test entry for max_words splitting test
  - Property drawer is not required for the test
  - Keep minimal test case to reduce the chance of confusion
---
 src/search_type/text_search.py | 8 ++++----
 tests/test_org_to_jsonl.py     | 5 +----
 tests/test_text_search.py      | 4 ++--
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py
index bc3725d9..5bbbdd64 100644
--- a/src/search_type/text_search.py
+++ b/src/search_type/text_search.py
@@ -150,16 +150,16 @@ def query(raw_query: str, model: TextSearchModel, rank_results=False):
         end = time.time()
         logger.debug(f"Rank Time: {end - start:.3f} seconds on device: {state.device}")
 
-    # Deduplicate entries by raw entry text
-    # Required because entries are split by max_word count supported by ML model. This results in duplicate hits, entries
+    # Deduplicate entries by raw entry text before showing to users
+    # Compiled entries are split by the max tokens supported by the ML model.
+    # This can result in duplicate hits and entries being shown to the user.
     start = time.time()
     seen, original_hits_count = set(), len(hits)
     hits = [hit for hit in hits if entries[hit['corpus_id']].raw not in seen and not seen.add(entries[hit['corpus_id']].raw)]
     duplicate_hits = original_hits_count - len(hits)
     end = time.time()
-    logger.debug(f"Removed {duplicate_hits} Duplicate Hits")
-    logger.debug(f"Deduplication Time: {end - start:.3f} seconds")
+    logger.debug(f"Deduplication Time: {end - start:.3f} seconds. Removed {duplicate_hits} duplicates")
 
     return hits, entries
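Reviewer note on the deduplication touched above: it keeps the first hit per raw entry using the set.add()-inside-a-comprehension idiom. seen.add(x) returns None (falsy), so `not seen.add(x)` is always true and only serves to record x once the `not in seen` membership test has passed. A minimal, self-contained sketch of the idiom with simplified data (plain strings instead of khoj's actual Entry objects):

    # Order-preserving dedup of hits by raw entry text, as in query() above.
    # seen.add(x) returns None, so `not seen.add(x)` is always True and just
    # records x; the `not in seen` check does the actual filtering.
    hits = [{"corpus_id": 0}, {"corpus_id": 1}, {"corpus_id": 2}]
    raw = ["apple", "apple", "banana"]  # hits 0 and 1 share the same raw entry

    seen = set()
    deduped = [hit for hit in hits
               if raw[hit["corpus_id"]] not in seen
               and not seen.add(raw[hit["corpus_id"]])]

    assert [raw[hit["corpus_id"]] for hit in deduped] == ["apple", "banana"]

Because the first occurrence wins, the best-ranked hit for each entry survives when the hits are already sorted by score.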
diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py
index ee9aae16..fe64cc67 100644
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@@ -40,9 +40,6 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
     "Ensure entries with compiled words exceeding max_words are split."
     # Arrange
     entry = f'''*** Heading
-    :PROPERTIES:
-    :ID: 42-42-42
-    :END:
 \t\r  Body Line 1
 '''
@@ -52,7 +49,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
 
     # Extract Entries from specified Org files
     entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])
 
-    # Split Each Entry from specified Org files by Max Words
+    # Split each entry from specified Org files by max words
     jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
         TextToJsonl.split_entries_by_max_tokens(
             OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map),

diff --git a/tests/test_text_search.py b/tests/test_text_search.py
index 528b2f31..dcacf7fb 100644
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -83,7 +83,7 @@ def test_asymmetric_search(content_config: ContentConfig, search_config: SearchC
 # ----------------------------------------------------------------------------------------------------
 def test_entry_chunking_by_max_tokens(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
-    initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
+    initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
 
     assert len(initial_notes_model.entries) == 10
     assert len(initial_notes_model.corpus_embeddings) == 10
 
@@ -91,7 +91,7 @@ def test_entry_chunking_by_max_tokens(content_config: ContentConfig, search_conf
 
     file_to_add_on_reload = Path(content_config.org.input_filter[0]).parent / "entry_exceeding_max_tokens.org"
     content_config.org.input_files = [f'{file_to_add_on_reload}']
 
-    # Append Org-Mode Entry with size exceeding max token limit to new Org File in Config
+    # Insert org-mode entry with size exceeding max token limit into new org file
     max_tokens = 256
     with open(file_to_add_on_reload, "w") as f:
         f.write(f"* Entry more than {max_tokens} words\n")
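Context for the chunking behavior these tests pin down: an entry whose compiled text exceeds the model's max token limit is split into multiple entries that share the same raw text, which is exactly what makes the deduplication in text_search.query() necessary. A simplified, word-based sketch of the idea follows; split_by_max_words is a hypothetical stand-in, not khoj's actual TextToJsonl.split_entries_by_max_tokens:

    # Hypothetical sketch: approximates "tokens" with whitespace-separated
    # words; the real split depends on the ML model's tokenizer and limits.
    def split_by_max_words(compiled: str, max_words: int = 256) -> list[str]:
        words = compiled.split()
        return [" ".join(words[i:i + max_words])
                for i in range(0, len(words), max_words)]

    chunks = split_by_max_words("word " * 300)
    assert len(chunks) == 2                # 300 words -> chunks of 256 and 44
    assert len(chunks[0].split()) == 256

Each chunk becomes its own searchable entry while pointing back at the same raw entry text, so a query can match several chunks of one underlying entry.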