From 5673bd5b9664207c6d83d030a6119d5863b5783c Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Thu, 30 Mar 2023 12:38:45 +0700
Subject: [PATCH] Keep original formatting in compiled text entry strings

- Explicitly split entry string by space during split by max_tokens
- Prevent formatting of compiled entry from being lost
- The formatting itself contains useful information
  No point in dropping the formatting unnecessarily,
  even if (say) the current search models don't account for it (yet)
---
 src/khoj/processor/text_to_jsonl.py | 2 +-
 tests/test_org_to_jsonl.py          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py
index 570c22bb..22de2c01 100644
--- a/src/khoj/processor/text_to_jsonl.py
+++ b/src/khoj/processor/text_to_jsonl.py
@@ -31,7 +31,7 @@ class TextToJsonl(ABC):
         "Split entries if compiled entry length exceeds the max tokens supported by the ML model."
         chunked_entries: List[Entry] = []
         for entry in entries:
-            compiled_entry_words = entry.compiled.split()
+            compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""]
             # Drop long words instead of having entry truncated to maintain quality of entry processed by models
             compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
             for chunk_index in range(0, len(compiled_entry_words), max_tokens):
diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py
index b8803772..aed4983f 100644
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@@ -44,7 +44,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
     # Arrange
     entry = f"""*** Heading
     \t\r
-    Body Line 1
+    Body Line
     """
     orgfile = create_file(tmp_path, entry)