From e057c8e20838b18c79453c962c949ab1d209898f Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 23 Dec 2022 15:45:53 -0300 Subject: [PATCH 1/7] Add method to split entries by specified max tokens limit - Issue ML Models truncate entries exceeding some max token limit. This lowers the quality of search results - Fix Split entries by max tokens before indexing. This should improve searching for content in longer entries. - Miscellaneous - Test method to split entries by max tokens --- src/processor/text_to_jsonl.py | 13 +++++++++++++ tests/test_org_to_jsonl.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/src/processor/text_to_jsonl.py b/src/processor/text_to_jsonl.py index 2f5e7e40..0eb60e6c 100644 --- a/src/processor/text_to_jsonl.py +++ b/src/processor/text_to_jsonl.py @@ -23,6 +23,19 @@ class TextToJsonl(ABC): def hash_func(key: str) -> Callable: return lambda entry: hashlib.md5(bytes(getattr(entry, key), encoding='utf-8')).hexdigest() + @staticmethod + def split_entries_by_max_tokens(entries: list[Entry], max_tokens: int=256) -> list[Entry]: + "Split entries if compiled entry length exceeds the max tokens supported by the ML model." + chunked_entries: list[Entry] = [] + for entry in entries: + compiled_entry_words = entry.compiled.split() + for chunk_index in range(0, len(compiled_entry_words), max_tokens): + compiled_entry_words_chunk = compiled_entry_words[chunk_index:chunk_index + max_tokens] + compiled_entry_chunk = ' '.join(compiled_entry_words_chunk) + entry_chunk = Entry(compiled=compiled_entry_chunk, raw=entry.raw, file=entry.file) + chunked_entries.append(entry_chunk) + return chunked_entries + def mark_entries_for_update(self, current_entries: list[Entry], previous_entries: list[Entry], key='compiled', logger=None) -> list[tuple[int, Entry]]: # Hash all current and previous entries to identify new entries start = time.time() diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index 2dbedcd0..ee9aae16 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -3,6 +3,7 @@ import json # Internal Packages from src.processor.org_mode.org_to_jsonl import OrgToJsonl +from src.processor.text_to_jsonl import TextToJsonl from src.utils.helpers import is_none_or_empty @@ -35,6 +36,34 @@ def test_configure_heading_entry_to_jsonl(tmp_path): assert is_none_or_empty(jsonl_data) +def test_entry_split_when_exceeds_max_words(tmp_path): + "Ensure entries with compiled words exceeding max_words are split." + # Arrange + entry = f'''*** Heading + :PROPERTIES: + :ID: 42-42-42 + :END: + \t\r + Body Line 1 + ''' + orgfile = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Org files + entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile]) + + # Split Each Entry from specified Org files by Max Words + jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl( + TextToJsonl.split_entries_by_max_tokens( + OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), + max_tokens = 2) + ) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 2 + + def test_entry_with_body_to_jsonl(tmp_path): "Ensure entries with valid body text are loaded." 
# Arrange From c79919bd68a2b02eed99932ec9991540f47b00b1 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 23 Dec 2022 15:52:02 -0300 Subject: [PATCH 2/7] Split entries by max tokens while converting Org entries To JSONL - Test usage of the entry splitting by max tokens in text search --- src/processor/org_mode/org_to_jsonl.py | 5 +++++ tests/test_text_search.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index 52441a99..4a4bd598 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -43,6 +43,11 @@ class OrgToJsonl(TextToJsonl): end = time.time() logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds") + start = time.time() + current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) + end = time.time() + logger.debug(f"Split entries by max token size supported by model: {end - start} seconds") + # Identify, mark and merge any new entries with previous entries if not previous_entries: entries_with_ids = list(enumerate(current_entries)) diff --git a/tests/test_text_search.py b/tests/test_text_search.py index e05831a1..1a1d65d9 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -80,6 +80,39 @@ def test_asymmetric_search(content_config: ContentConfig, search_config: SearchC assert "git clone" in search_result +# ---------------------------------------------------------------------------------------------------- +def test_entry_chunking_by_max_tokens(content_config: ContentConfig, search_config: SearchConfig): + # Arrange + initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False) + + assert len(initial_notes_model.entries) == 10 + assert len(initial_notes_model.corpus_embeddings) == 10 + + file_to_add_on_reload = Path(content_config.org.input_filter[0]).parent / "entry_exceeding_max_tokens.org" + content_config.org.input_files = [f'{file_to_add_on_reload}'] + + # Append Org-Mode Entry with size exceeding max token limit to new Org File in Config + max_tokens = 256 + with open(file_to_add_on_reload, "w") as f: + f.write(f"* Entry more than {max_tokens} words\n") + for index in range(max_tokens+1): + f.write(f"{index} ") + + # Act + # reload embeddings, entries, notes model after adding new org-mode file + initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False) + + # Assert + # verify newly added org-mode entry is split by max tokens + assert len(initial_notes_model.entries) == 12 + assert len(initial_notes_model.corpus_embeddings) == 12 + + # Cleanup + # delete reload test file added + content_config.org.input_files = [] + file_to_add_on_reload.unlink() + + # ---------------------------------------------------------------------------------------------------- def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchConfig): # Arrange From 53cd2e5605b856300137895b5fd294244c67cc55 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 23 Dec 2022 16:18:22 -0300 Subject: [PATCH 3/7] Regenerate initial model in asymmetric reload test to reduce flakiness - Fix logger message when converting org node to entries - Remove unused import from conftest --- src/processor/org_mode/org_to_jsonl.py | 2 +- tests/conftest.py | 1 - tests/test_text_search.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git
a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index 4a4bd598..313c9a3f 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -41,7 +41,7 @@ class OrgToJsonl(TextToJsonl): start = time.time() current_entries = self.convert_org_nodes_to_entries(entry_nodes, file_to_entries, index_heading_entries) end = time.time() - logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds") + logger.debug(f"Convert OrgNodes into list of entries: {end - start} seconds") start = time.time() current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) diff --git a/tests/conftest.py b/tests/conftest.py index 103a28e8..ec87f964 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,7 +3,6 @@ import pytest # Internal Packages from src.search_type import image_search, text_search -from src.utils.config import SearchType from src.utils.helpers import resolve_absolute_path from src.utils.rawconfig import ContentConfig, TextContentConfig, ImageContentConfig, SearchConfig, TextSearchConfig, ImageSearchConfig from src.processor.org_mode.org_to_jsonl import OrgToJsonl diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 1a1d65d9..528b2f31 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -116,7 +116,7 @@ def test_entry_chunking_by_max_tokens(content_config: ContentConfig, search_conf # ---------------------------------------------------------------------------------------------------- def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchConfig): # Arrange - initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False) + initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True) assert len(initial_notes_model.entries) == 10 assert len(initial_notes_model.corpus_embeddings) == 10 From b283650991c1a218b146715f4a8288efb41e3fa4 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 23 Dec 2022 19:01:39 -0300 Subject: [PATCH 4/7] Deduplicate results for user query by raw text before returning results - Required because entries are now split by the max_word count supported by the ML models - This would now result in potentially duplicate hits, entries being returned to user - Do deduplication after ranking to get the top ranked deduplicated results --- src/search_type/text_search.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index 8b29c517..bc3725d9 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -150,6 +150,17 @@ def query(raw_query: str, model: TextSearchModel, rank_results=False): end = time.time() logger.debug(f"Rank Time: {end - start:.3f} seconds on device: {state.device}") + # Deduplicate entries by raw entry text + # Required because entries are split by max_word count supported by ML model. 
This results in duplicate hits, entries + start = time.time() + seen, original_hits_count = set(), len(hits) + hits = [hit for hit in hits + if entries[hit['corpus_id']].raw not in seen and not seen.add(entries[hit['corpus_id']].raw)] + duplicate_hits = original_hits_count - len(hits) + end = time.time() + logger.debug(f"Removed {duplicate_hits} Duplicate Hits") + logger.debug(f"Deduplication Time: {end - start:.3f} seconds") + return hits, entries From 24676f95d8d9bf1bd5526aa44f60827ee74dc6fe Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 25 Dec 2022 21:45:40 -0300 Subject: [PATCH 5/7] Fix comments, use minimal test case, regenerate test index, merge debug logs - Remove property drawer from test entry for max_words splitting test - Property drawer is not required for the test - Keep minimal test case to reduce chance for confusion --- src/search_type/text_search.py | 8 ++++---- tests/test_org_to_jsonl.py | 5 +---- tests/test_text_search.py | 4 ++-- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index bc3725d9..5bbbdd64 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -150,16 +150,16 @@ def query(raw_query: str, model: TextSearchModel, rank_results=False): end = time.time() logger.debug(f"Rank Time: {end - start:.3f} seconds on device: {state.device}") - # Deduplicate entries by raw entry text - # Required because entries are split by max_word count supported by ML model. This results in duplicate hits, entries + # Deduplicate entries by raw entry text before showing to users + # Compiled entries are split by max tokens supported by ML models. + # This can result in duplicate hits, entries shown to user. start = time.time() seen, original_hits_count = set(), len(hits) hits = [hit for hit in hits if entries[hit['corpus_id']].raw not in seen and not seen.add(entries[hit['corpus_id']].raw)] duplicate_hits = original_hits_count - len(hits) end = time.time() - logger.debug(f"Removed {duplicate_hits} Duplicate Hits") - logger.debug(f"Deduplication Time: {end - start:.3f} seconds") + logger.debug(f"Deduplication Time: {end - start:.3f} seconds. Removed {duplicate_hits} duplicates") return hits, entries diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index ee9aae16..fe64cc67 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -40,9 +40,6 @@ def test_entry_split_when_exceeds_max_words(tmp_path): "Ensure entries with compiled words exceeding max_words are split." 
# Arrange entry = f'''*** Heading - :PROPERTIES: - :ID: 42-42-42 - :END: \t\r Body Line 1 ''' @@ -52,7 +49,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path): # Extract Entries from specified Org files entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile]) - # Split Each Entry from specified Org files by Max Words + # Split each entry from specified Org files by max words jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl( TextToJsonl.split_entries_by_max_tokens( OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 528b2f31..dcacf7fb 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -83,7 +83,7 @@ def test_asymmetric_search(content_config: ContentConfig, search_config: SearchC # ---------------------------------------------------------------------------------------------------- def test_entry_chunking_by_max_tokens(content_config: ContentConfig, search_config: SearchConfig): # Arrange - initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False) + initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True) assert len(initial_notes_model.entries) == 10 assert len(initial_notes_model.corpus_embeddings) == 10 @@ -91,7 +91,7 @@ def test_entry_chunking_by_max_tokens(content_config: ContentConfig, search_conf file_to_add_on_reload = Path(content_config.org.input_filter[0]).parent / "entry_exceeding_max_tokens.org" content_config.org.input_files = [f'{file_to_add_on_reload}'] - # Append Org-Mode Entry with size exceeding max token limit to new Org File in Config + # Insert org-mode entry with size exceeding max token limit to new org file max_tokens = 256 with open(file_to_add_on_reload, "w") as f: f.write(f"* Entry more than {max_tokens} words\n") From f209e30a3b320f3a0fd616b7cb3fffe2e2cd0847 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 26 Dec 2022 13:14:15 -0300 Subject: [PATCH 6/7] Split entries by max tokens while converting Markdown entries To JSONL --- src/processor/markdown/markdown_to_jsonl.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py index 5c4d660d..17482de5 100644 --- a/src/processor/markdown/markdown_to_jsonl.py +++ b/src/processor/markdown/markdown_to_jsonl.py @@ -35,6 +35,12 @@ class MarkdownToJsonl(TextToJsonl): end = time.time() logger.debug(f"Parse entries from Markdown files into dictionaries: {end - start} seconds") + # Split entries by max tokens supported by model + start = time.time() + current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) + end = time.time() + logger.debug(f"Split entries by max token size supported by model: {end - start} seconds") + # Identify, mark and merge any new entries with previous entries start = time.time() if not previous_entries: From 17fa123b4ec858b846fa8ab4f0e4dbc84360c884 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 26 Dec 2022 15:14:32 -0300 Subject: [PATCH 7/7] Split entries by max tokens while converting Beancount entries To JSONL --- src/processor/ledger/beancount_to_jsonl.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index ccad97da..9f37df70 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ 
b/src/processor/ledger/beancount_to_jsonl.py @@ -35,6 +35,12 @@ class BeancountToJsonl(TextToJsonl): end = time.time() logger.debug(f"Parse transactions from Beancount files into dictionaries: {end - start} seconds") + # Split entries by max tokens supported by model + start = time.time() + current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) + end = time.time() + logger.debug(f"Split entries by max token size supported by model: {end - start} seconds") + # Identify, mark and merge any new entries with previous entries start = time.time() if not previous_entries:
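
Taken together, the patches above chunk each compiled entry into word-count windows before indexing (patches 1, 2, 6, 7) and then collapse ranked hits that point back to the same raw entry (patches 4, 5). The standalone Python sketch below condenses that flow for reference and is not part of the patch series: the Entry field names and the chunking and deduplication logic mirror the patched code, but the simplified Entry dataclass, the deduplicate_hits helper name, the toy hit dictionaries and the __main__ harness are illustrative assumptions.

from dataclasses import dataclass

@dataclass
class Entry:
    compiled: str   # text sent to the embedding model; this is what gets chunked
    raw: str        # original entry text; shared by every chunk of one entry
    file: str       # source file the entry came from

def split_entries_by_max_tokens(entries: list[Entry], max_tokens: int = 256) -> list[Entry]:
    "Split any entry whose compiled text exceeds max_tokens words into word-count chunks."
    chunked_entries: list[Entry] = []
    for entry in entries:
        words = entry.compiled.split()
        for chunk_start in range(0, len(words), max_tokens):
            chunk = ' '.join(words[chunk_start:chunk_start + max_tokens])
            chunked_entries.append(Entry(compiled=chunk, raw=entry.raw, file=entry.file))
    return chunked_entries

def deduplicate_hits(hits: list[dict], entries: list[Entry]) -> list[dict]:
    "Keep only the first hit per raw entry, since several chunks of one entry can match the same query."
    seen: set[str] = set()
    deduplicated_hits = []
    for hit in hits:  # hits are assumed to already be sorted by score, i.e. after ranking
        raw = entries[hit['corpus_id']].raw
        if raw not in seen:
            seen.add(raw)
            deduplicated_hits.append(hit)
    return deduplicated_hits

if __name__ == '__main__':
    # A 10-word entry split with max_tokens=4 yields 3 chunks of 4, 4 and 2 words
    long_entry = Entry(compiled=' '.join(str(i) for i in range(10)), raw='* Long entry', file='notes.org')
    chunks = split_entries_by_max_tokens([long_entry], max_tokens=4)
    assert len(chunks) == 3
    # Two chunks of the same entry match a query; only the top ranked hit survives deduplication
    hits = [{'corpus_id': 0, 'score': 0.9}, {'corpus_id': 1, 'score': 0.8}]
    assert len(deduplicate_hits(hits, chunks)) == 1

Deduplication runs after ranking, as in patch 4, so the highest scoring chunk is the one returned for each raw entry.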