Keep original formatting in compiled text entry strings

- Explicitly split entry string by space during split by max_tokens
- Prevent formatting of compiled entry from being lost
- The formatting itself contains useful information. No point in dropping the formatting unnecessarily, even if (say) the current search models don't account for it (yet)
2024-11-23 23:48:56 +01:00 · 2023-03-30 12:38:45 +07:00 · 2023-03-30 12:38:45 +07:00 · 5673bd5b96
commit 5673bd5b96
parent a2ab68a7a2
2 changed files with 2 additions and 2 deletions
--- a/src/khoj/processor/text_to_jsonl.py
+++ b/src/khoj/processor/text_to_jsonl.py
@ -31,7 +31,7 @@ class TextToJsonl(ABC):
        "Split entries if compiled entry length exceeds the max tokens supported by the ML model."
        chunked_entries: List[Entry] = []
        for entry in entries:
-            compiled_entry_words = entry.compiled.split()
+            compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""]
            # Drop long words instead of having entry truncated to maintain quality of entry processed by models
            compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
            for chunk_index in range(0, len(compiled_entry_words), max_tokens):
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@ -44,7 +44,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
    # Arrange
    entry = f"""*** Heading
    \t\r
-    Body Line 1
+    Body Line
    """
    orgfile = create_file(tmp_path, entry)