diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py
index 570c22bb..22de2c01 100644
--- a/src/khoj/processor/text_to_jsonl.py
+++ b/src/khoj/processor/text_to_jsonl.py
@@ -31,7 +31,7 @@ class TextToJsonl(ABC):
         "Split entries if compiled entry length exceeds the max tokens supported by the ML model."
         chunked_entries: List[Entry] = []
         for entry in entries:
-            compiled_entry_words = entry.compiled.split()
+            compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""]
             # Drop long words instead of having entry truncated to maintain quality of entry processed by models
             compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
            for chunk_index in range(0, len(compiled_entry_words), max_tokens):
diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py
index b8803772..aed4983f 100644
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@@ -44,7 +44,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
     # Arrange
     entry = f"""*** Heading
     \t\r
-    Body Line 1
+    Body Line
     """
     orgfile = create_file(tmp_path, entry)
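
For context, a minimal sketch (not part of the change) of why the split moves from `str.split()` to a space-only split plus an empty-string filter: `split()` collapses runs of any whitespace and silently drops newlines, tabs, and carriage returns, while `split(" ")` keeps that whitespace attached to the neighbouring word, so it can survive when the chunk's words are later rejoined (my reading of the intent; the sample string below is made up for illustration, not taken from the test fixture).

```python
compiled = "*** Heading\n\t\r\nBody  Line 1\n"

# Old behaviour: str.split() splits on runs of any whitespace,
# discarding the newline, tab and carriage return characters.
print(compiled.split())
# ['***', 'Heading', 'Body', 'Line', '1']

# New behaviour: splitting only on spaces keeps other whitespace attached
# to its word; the filter drops the empty strings produced by consecutive
# spaces.
words = [word for word in compiled.split(" ") if word != ""]
print(words)
# ['***', 'Heading\n\t\r\nBody', 'Line', '1\n']
```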