Split entries by max tokens while converting Org entries to JSONL

- Test usage of entry splitting by max tokens in text search
2024-11-23 23:48:56 +01:00 · 2022-12-23 15:52:02 -03:00 · 2022-12-23 15:52:02 -03:00 · c79919bd68
commit c79919bd68
parent e057c8e208
2 changed files with 38 additions and 0 deletions
--- a/src/processor/org_mode/org_to_jsonl.py
+++ b/src/processor/org_mode/org_to_jsonl.py
@ -43,6 +43,11 @@ class OrgToJsonl(TextToJsonl):
        end = time.time()
        logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds")

+        start = time.time()
+        current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
+        end = time.time()
+        logger.debug(f"Split entries by max token size supported by model: {end - start} seconds")
+
        # Identify, mark and merge any new entries with previous entries
        if not previous_entries:
            entries_with_ids = list(enumerate(current_entries))
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@ -80,6 +80,39 @@ def test_asymmetric_search(content_config: ContentConfig, search_config: SearchC
    assert "git clone" in search_result


+# ----------------------------------------------------------------------------------------------------
+def test_entry_chunking_by_max_tokens(content_config: ContentConfig, search_config: SearchConfig):
+    # Arrange: load the notes model for the configured org content without forcing regeneration
+    initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
+
+    assert len(initial_notes_model.entries) == 10
+    assert len(initial_notes_model.corpus_embeddings) == 10
+
+    file_to_add_on_reload = Path(content_config.org.input_filter[0]).parent / "entry_exceeding_max_tokens.org"
+    content_config.org.input_files = [f'{file_to_add_on_reload}']
+
+    # Write (mode "w": create or overwrite) a single Org-Mode entry whose body holds max_tokens + 1 space-separated numbers
+    max_tokens = 256
+    with open(file_to_add_on_reload, "w") as f:
+        f.write(f"* Entry more than {max_tokens} words\n")
+        for index in range(max_tokens+1):
+            f.write(f"{index} ")
+
+    # Act
+    # reload the notes model now that input_files points at the newly written org-mode file
+    initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
+
+    # Assert
+    # verify the newly added org-mode entry was split by max tokens: entry and embedding counts go from 10 to 12
+    assert len(initial_notes_model.entries) == 12
+    assert len(initial_notes_model.corpus_embeddings) == 12
+
+    # Cleanup
+    # reset input_files on the shared config and delete the org-mode file created by this test
+    content_config.org.input_files = []
+    file_to_add_on_reload.unlink()
+
+
 # ----------------------------------------------------------------------------------------------------
 def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchConfig):
    # Arrange