From 5673bd5b9664207c6d83d030a6119d5863b5783c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 30 Mar 2023 12:38:45 +0700 Subject: [PATCH] Keep original formatting in compiled text entry strings - Explicitly split entry string by space during split by max_tokens - Prevent formatting of compiled entry from being lost - The formatting itself contains useful information No point in dropping the formatting unnecessarily, even if (say) the current search models don't account for it (yet) --- src/khoj/processor/text_to_jsonl.py | 2 +- tests/test_org_to_jsonl.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py index 570c22bb..22de2c01 100644 --- a/src/khoj/processor/text_to_jsonl.py +++ b/src/khoj/processor/text_to_jsonl.py @@ -31,7 +31,7 @@ class TextToJsonl(ABC): "Split entries if compiled entry length exceeds the max tokens supported by the ML model." chunked_entries: List[Entry] = [] for entry in entries: - compiled_entry_words = entry.compiled.split() + compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""] # Drop long words instead of having entry truncated to maintain quality of entry processed by models compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length] for chunk_index in range(0, len(compiled_entry_words), max_tokens): diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index b8803772..aed4983f 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -44,7 +44,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path): # Arrange entry = f"""*** Heading \t\r - Body Line 1 + Body Line """ orgfile = create_file(tmp_path, entry)