diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py
index 52441a99..4a4bd598 100644
--- a/src/processor/org_mode/org_to_jsonl.py
+++ b/src/processor/org_mode/org_to_jsonl.py
@@ -43,6 +43,11 @@ class OrgToJsonl(TextToJsonl):
         end = time.time()
         logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds")
 
+        start = time.time()
+        current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
+        end = time.time()
+        logger.debug(f"Split entries by max token size supported by model: {end - start} seconds")
+
         # Identify, mark and merge any new entries with previous entries
         if not previous_entries:
             entries_with_ids = list(enumerate(current_entries))
diff --git a/tests/test_text_search.py b/tests/test_text_search.py
index e05831a1..1a1d65d9 100644
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -80,6 +80,44 @@ def test_asymmetric_search(content_config: ContentConfig, search_config: SearchC
     assert "git clone" in search_result
 
 
+# ----------------------------------------------------------------------------------------------------
+def test_entry_chunking_by_max_tokens(content_config: ContentConfig, search_config: SearchConfig):
+    """Verify an org-mode entry longer than the max token limit is split into multiple entries."""
+    # Arrange
+    initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
+
+    assert len(initial_notes_model.entries) == 10
+    assert len(initial_notes_model.corpus_embeddings) == 10
+
+    file_to_add_on_reload = Path(content_config.org.input_filter[0]).parent / "entry_exceeding_max_tokens.org"
+    content_config.org.input_files = [f'{file_to_add_on_reload}']
+
+    # Run the rest inside try/finally so the temporary org file and the config change
+    # are always undone, even when setup raises or an assertion fails
+    try:
+        # Append Org-Mode Entry with size exceeding max token limit to new Org File in Config
+        max_tokens = 256
+        with open(file_to_add_on_reload, "w") as f:
+            f.write(f"* Entry more than {max_tokens} words\n")
+            for index in range(max_tokens+1):
+                f.write(f"{index} ")
+
+        # Act
+        # reload embeddings, entries, notes model after adding new org-mode file
+        initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
+
+        # Assert
+        # verify newly added org-mode entry is split by max tokens
+        assert len(initial_notes_model.entries) == 12
+        assert len(initial_notes_model.corpus_embeddings) == 12
+    finally:
+        # Cleanup
+        # delete reload test file added
+        content_config.org.input_files = []
+        if file_to_add_on_reload.exists():
+            file_to_add_on_reload.unlink()
+
+
 # ----------------------------------------------------------------------------------------------------
 def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange