From e057c8e20838b18c79453c962c949ab1d209898f Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 23 Dec 2022 15:45:53 -0300 Subject: [PATCH 1/7] Add method to split entries by specified max tokens limit - Issue ML Models truncate entries exceeding some max token limit. This lowers the quality of search results - Fix Split entries by max tokens before indexing. This should improve searching for content in longer entries. - Miscellaneous - Test method to split entries by max tokens --- src/processor/text_to_jsonl.py | 13 +++++++++++++ tests/test_org_to_jsonl.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/src/processor/text_to_jsonl.py b/src/processor/text_to_jsonl.py index 2f5e7e40..0eb60e6c 100644 --- a/src/processor/text_to_jsonl.py +++ b/src/processor/text_to_jsonl.py @@ -23,6 +23,19 @@ class TextToJsonl(ABC): def hash_func(key: str) -> Callable: return lambda entry: hashlib.md5(bytes(getattr(entry, key), encoding='utf-8')).hexdigest() + @staticmethod + def split_entries_by_max_tokens(entries: list[Entry], max_tokens: int=256) -> list[Entry]: + "Split entries if compiled entry length exceeds the max tokens supported by the ML model." + chunked_entries: list[Entry] = [] + for entry in entries: + compiled_entry_words = entry.compiled.split() + for chunk_index in range(0, len(compiled_entry_words), max_tokens): + compiled_entry_words_chunk = compiled_entry_words[chunk_index:chunk_index + max_tokens] + compiled_entry_chunk = ' '.join(compiled_entry_words_chunk) + entry_chunk = Entry(compiled=compiled_entry_chunk, raw=entry.raw, file=entry.file) + chunked_entries.append(entry_chunk) + return chunked_entries + def mark_entries_for_update(self, current_entries: list[Entry], previous_entries: list[Entry], key='compiled', logger=None) -> list[tuple[int, Entry]]: # Hash all current and previous entries to identify new entries start = time.time() diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index 2dbedcd0..ee9aae16 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -3,6 +3,7 @@ import json # Internal Packages from src.processor.org_mode.org_to_jsonl import OrgToJsonl +from src.processor.text_to_jsonl import TextToJsonl from src.utils.helpers import is_none_or_empty @@ -35,6 +36,34 @@ def test_configure_heading_entry_to_jsonl(tmp_path): assert is_none_or_empty(jsonl_data) +def test_entry_split_when_exceeds_max_words(tmp_path): + "Ensure entries with compiled words exceeding max_words are split." + # Arrange + entry = f'''*** Heading + :PROPERTIES: + :ID: 42-42-42 + :END: + \t\r + Body Line 1 + ''' + orgfile = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Org files + entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile]) + + # Split Each Entry from specified Org files by Max Words + jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl( + TextToJsonl.split_entries_by_max_tokens( + OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), + max_tokens = 2) + ) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 2 + + def test_entry_with_body_to_jsonl(tmp_path): "Ensure entries with valid body text are loaded." 
# Arrange From c79919bd68a2b02eed99932ec9991540f47b00b1 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 23 Dec 2022 15:52:02 -0300 Subject: [PATCH 2/7] Split entries by max tokens while converting Org entries To JSONL - Test usage of the entry splitting by max tokens in text search --- src/processor/org_mode/org_to_jsonl.py | 5 +++++ tests/test_text_search.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index 52441a99..4a4bd598 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -43,6 +43,11 @@ class OrgToJsonl(TextToJsonl): end = time.time() logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds") + start = time.time() + current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) + end = time.time() + logger.debug(f"Split entries by max token size supported by model: {end - start} seconds") + # Identify, mark and merge any new entries with previous entries if not previous_entries: entries_with_ids = list(enumerate(current_entries)) diff --git a/tests/test_text_search.py b/tests/test_text_search.py index e05831a1..1a1d65d9 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -80,6 +80,39 @@ def test_asymmetric_search(content_config: ContentConfig, search_config: SearchC assert "git clone" in search_result +# ---------------------------------------------------------------------------------------------------- +def test_entry_chunking_by_max_tokens(content_config: ContentConfig, search_config: SearchConfig): + # Arrange + initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False) + + assert len(initial_notes_model.entries) == 10 + assert len(initial_notes_model.corpus_embeddings) == 10 + + file_to_add_on_reload = Path(content_config.org.input_filter[0]).parent / "entry_exceeding_max_tokens.org" + content_config.org.input_files = [f'{file_to_add_on_reload}'] + + # Append Org-Mode Entry with size exceeding max token limit to new Org File in Config + max_tokens = 256 + with open(file_to_add_on_reload, "w") as f: + f.write(f"* Entry more than {max_tokens} words\n") + for index in range(max_tokens+1): + f.write(f"{index} ") + + # Act + # reload embeddings, entries, notes model after adding new org-mode file + initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False) + + # Assert + # verify newly added org-mode entry is split by max tokens + assert len(initial_notes_model.entries) == 12 + assert len(initial_notes_model.corpus_embeddings) == 12 + + # Cleanup + # delete reload test file added + content_config.org.input_files = [] + file_to_add_on_reload.unlink() + + # ---------------------------------------------------------------------------------------------------- def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchConfig): # Arrange From 53cd2e5605b856300137895b5fd294244c67cc55 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 23 Dec 2022 16:18:22 -0300 Subject: [PATCH 3/7] Regenerate initial model in asymmetric reload test to reduce flakiness - Fix logger message when converting org node to entries - Remove unused import from conftest --- src/processor/org_mode/org_to_jsonl.py | 2 +- tests/conftest.py | 1 - tests/test_text_search.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git
a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index 4a4bd598..313c9a3f 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -41,7 +41,7 @@ class OrgToJsonl(TextToJsonl): start = time.time() current_entries = self.convert_org_nodes_to_entries(entry_nodes, file_to_entries, index_heading_entries) end = time.time() - logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds") + logger.debug(f"Convert OrgNodes into list of entries: {end - start} seconds") start = time.time() current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) diff --git a/tests/conftest.py b/tests/conftest.py index 103a28e8..ec87f964 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,7 +3,6 @@ import pytest # Internal Packages from src.search_type import image_search, text_search -from src.utils.config import SearchType from src.utils.helpers import resolve_absolute_path from src.utils.rawconfig import ContentConfig, TextContentConfig, ImageContentConfig, SearchConfig, TextSearchConfig, ImageSearchConfig from src.processor.org_mode.org_to_jsonl import OrgToJsonl diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 1a1d65d9..528b2f31 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -116,7 +116,7 @@ def test_entry_chunking_by_max_tokens(content_config: ContentConfig, search_conf # ---------------------------------------------------------------------------------------------------- def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchConfig): # Arrange - initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False) + initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True) assert len(initial_notes_model.entries) == 10 assert len(initial_notes_model.corpus_embeddings) == 10 From b283650991c1a218b146715f4a8288efb41e3fa4 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 23 Dec 2022 19:01:39 -0300 Subject: [PATCH 4/7] Deduplicate results for user query by raw text before returning results - Required because entries are now split by the max_word count supported by the ML models - This would now result in potentially duplicate hits, entries being returned to user - Do deduplication after ranking to get the top ranked deduplicated results --- src/search_type/text_search.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index 8b29c517..bc3725d9 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -150,6 +150,17 @@ def query(raw_query: str, model: TextSearchModel, rank_results=False): end = time.time() logger.debug(f"Rank Time: {end - start:.3f} seconds on device: {state.device}") + # Deduplicate entries by raw entry text + # Required because entries are split by max_word count supported by ML model. 
This results in duplicate hits, entries + start = time.time() + seen, original_hits_count = set(), len(hits) + hits = [hit for hit in hits + if entries[hit['corpus_id']].raw not in seen and not seen.add(entries[hit['corpus_id']].raw)] + duplicate_hits = original_hits_count - len(hits) + end = time.time() + logger.debug(f"Removed {duplicate_hits} Duplicate Hits") + logger.debug(f"Deduplication Time: {end - start:.3f} seconds") + return hits, entries From 24676f95d8d9bf1bd5526aa44f60827ee74dc6fe Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 25 Dec 2022 21:45:40 -0300 Subject: [PATCH 5/7] Fix comments, use minimal test case, regenerate test index, merge debug logs - Remove property drawer from test entry for max_words splitting test - Property drawer is not required for the test - Keep minimal test case to reduce chance for confusion --- src/search_type/text_search.py | 8 ++++---- tests/test_org_to_jsonl.py | 5 +---- tests/test_text_search.py | 4 ++-- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index bc3725d9..5bbbdd64 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -150,16 +150,16 @@ def query(raw_query: str, model: TextSearchModel, rank_results=False): end = time.time() logger.debug(f"Rank Time: {end - start:.3f} seconds on device: {state.device}") - # Deduplicate entries by raw entry text - # Required because entries are split by max_word count supported by ML model. This results in duplicate hits, entries + # Deduplicate entries by raw entry text before showing to users + # Compiled entries are split by max tokens supported by ML models. + # This can result in duplicate hits, entries shown to user. start = time.time() seen, original_hits_count = set(), len(hits) hits = [hit for hit in hits if entries[hit['corpus_id']].raw not in seen and not seen.add(entries[hit['corpus_id']].raw)] duplicate_hits = original_hits_count - len(hits) end = time.time() - logger.debug(f"Removed {duplicate_hits} Duplicate Hits") - logger.debug(f"Deduplication Time: {end - start:.3f} seconds") + logger.debug(f"Deduplication Time: {end - start:.3f} seconds. Removed {duplicate_hits} duplicates") return hits, entries diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index ee9aae16..fe64cc67 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -40,9 +40,6 @@ def test_entry_split_when_exceeds_max_words(tmp_path): "Ensure entries with compiled words exceeding max_words are split." 
# Arrange entry = f'''*** Heading - :PROPERTIES: - :ID: 42-42-42 - :END: \t\r Body Line 1 ''' @@ -52,7 +49,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path): # Extract Entries from specified Org files entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile]) - # Split Each Entry from specified Org files by Max Words + # Split each entry from specified Org files by max words jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl( TextToJsonl.split_entries_by_max_tokens( OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 528b2f31..dcacf7fb 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -83,7 +83,7 @@ def test_asymmetric_search(content_config: ContentConfig, search_config: SearchC # ---------------------------------------------------------------------------------------------------- def test_entry_chunking_by_max_tokens(content_config: ContentConfig, search_config: SearchConfig): # Arrange - initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False) + initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True) assert len(initial_notes_model.entries) == 10 assert len(initial_notes_model.corpus_embeddings) == 10 @@ -91,7 +91,7 @@ def test_entry_chunking_by_max_tokens(content_config: ContentConfig, search_conf file_to_add_on_reload = Path(content_config.org.input_filter[0]).parent / "entry_exceeding_max_tokens.org" content_config.org.input_files = [f'{file_to_add_on_reload}'] - # Append Org-Mode Entry with size exceeding max token limit to new Org File in Config + # Insert org-mode entry with size exceeding max token limit to new org file max_tokens = 256 with open(file_to_add_on_reload, "w") as f: f.write(f"* Entry more than {max_tokens} words\n") From f209e30a3b320f3a0fd616b7cb3fffe2e2cd0847 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 26 Dec 2022 13:14:15 -0300 Subject: [PATCH 6/7] Split entries by max tokens while converting Markdown entries To JSONL --- src/processor/markdown/markdown_to_jsonl.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py index 5c4d660d..17482de5 100644 --- a/src/processor/markdown/markdown_to_jsonl.py +++ b/src/processor/markdown/markdown_to_jsonl.py @@ -35,6 +35,12 @@ class MarkdownToJsonl(TextToJsonl): end = time.time() logger.debug(f"Parse entries from Markdown files into dictionaries: {end - start} seconds") + # Split entries by max tokens supported by model + start = time.time() + current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) + end = time.time() + logger.debug(f"Split entries by max token size supported by model: {end - start} seconds") + # Identify, mark and merge any new entries with previous entries start = time.time() if not previous_entries: From 17fa123b4ec858b846fa8ab4f0e4dbc84360c884 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 26 Dec 2022 15:14:32 -0300 Subject: [PATCH 7/7] Split entries by max tokens while converting Beancount entries To JSONL --- src/processor/ledger/beancount_to_jsonl.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index ccad97da..9f37df70 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ 
b/src/processor/ledger/beancount_to_jsonl.py @@ -35,6 +35,12 @@ class BeancountToJsonl(TextToJsonl): end = time.time() logger.debug(f"Parse transactions from Beancount files into dictionaries: {end - start} seconds") + # Split entries by max tokens supported by model + start = time.time() + current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) + end = time.time() + logger.debug(f"Split entries by max token size supported by model: {end - start} seconds") + # Identify, mark and merge any new entries with previous entries start = time.time() if not previous_entries:
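
Taken together, the patches above chunk each compiled entry into word-count windows before indexing (patches 1, 2, 6, 7) and then collapse ranked hits that point back to the same raw entry (patches 4, 5). The standalone Python sketch below condenses that flow for reference and is not part of the patch series: the Entry field names and the chunking and deduplication logic mirror the patched code, but the simplified Entry dataclass, the deduplicate_hits helper name, the toy hit dictionaries and the __main__ harness are illustrative assumptions.

from dataclasses import dataclass

@dataclass
class Entry:
    compiled: str   # text sent to the embedding model; this is what gets chunked
    raw: str        # original entry text; shared by every chunk of one entry
    file: str       # source file the entry came from

def split_entries_by_max_tokens(entries: list[Entry], max_tokens: int = 256) -> list[Entry]:
    "Split any entry whose compiled text exceeds max_tokens words into word-count chunks."
    chunked_entries: list[Entry] = []
    for entry in entries:
        words = entry.compiled.split()
        for chunk_start in range(0, len(words), max_tokens):
            chunk = ' '.join(words[chunk_start:chunk_start + max_tokens])
            chunked_entries.append(Entry(compiled=chunk, raw=entry.raw, file=entry.file))
    return chunked_entries

def deduplicate_hits(hits: list[dict], entries: list[Entry]) -> list[dict]:
    "Keep only the first hit per raw entry, since several chunks of one entry can match the same query."
    seen: set[str] = set()
    deduplicated_hits = []
    for hit in hits:  # hits are assumed to already be sorted by score, i.e. after ranking
        raw = entries[hit['corpus_id']].raw
        if raw not in seen:
            seen.add(raw)
            deduplicated_hits.append(hit)
    return deduplicated_hits

if __name__ == '__main__':
    # A 10-word entry split with max_tokens=4 yields 3 chunks of 4, 4 and 2 words
    long_entry = Entry(compiled=' '.join(str(i) for i in range(10)), raw='* Long entry', file='notes.org')
    chunks = split_entries_by_max_tokens([long_entry], max_tokens=4)
    assert len(chunks) == 3
    # Two chunks of the same entry match a query; only the top ranked hit survives deduplication
    hits = [{'corpus_id': 0, 'score': 0.9}, {'corpus_id': 1, 'score': 0.8}]
    assert len(deduplicate_hits(hits, chunks)) == 1

Deduplication runs after ranking, as in patch 4, so the highest scoring chunk is the one returned for each raw entry.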