Mirror of https://github.com/khoj-ai/khoj.git, synced 2024-11-27 17:35:07 +01:00
Keep original formatting in compiled text entry strings
- Explicitly split entry string by space during split by max_tokens
- Prevent formatting of compiled entry from being lost
- The formatting itself contains useful information. No point in dropping the formatting unnecessarily, even if (say) the current search models don't account for it (yet)
parent a2ab68a7a2
commit 5673bd5b96

2 changed files with 2 additions and 2 deletions
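To make the change concrete, here is a small illustration (not code from the repository) of the difference between the old str.split() call and the new split-by-space expression: split() collapses all whitespace, while split(" ") keeps newlines and tabs attached to the neighbouring words, so re-joining the words with single spaces reproduces the original formatting. The sample string below is made up; the two split expressions are the before/after from the diff in this commit.

# Illustration only
compiled = "*** Heading\nBody Line 1\n\tBody Line 2\n"

old_words = compiled.split()                                      # collapses all whitespace
new_words = [word for word in compiled.split(" ") if word != ""]  # splits on spaces only

print(" ".join(old_words))   # '*** Heading Body Line 1 Body Line 2' -- newlines and tabs are gone
print(" ".join(new_words))   # identical to `compiled` -- formatting preserved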
@@ -31,7 +31,7 @@ class TextToJsonl(ABC):
         "Split entries if compiled entry length exceeds the max tokens supported by the ML model."
         chunked_entries: List[Entry] = []
         for entry in entries:
-            compiled_entry_words = entry.compiled.split()
+            compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""]
             # Drop long words instead of having entry truncated to maintain quality of entry processed by models
             compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
             for chunk_index in range(0, len(compiled_entry_words), max_tokens):
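For context, a minimal, self-contained sketch of the split-by-max-tokens logic around the hunk above. Only the lines visible in the hunk are taken from the repository; the function signature, the default values, and the " ".join() recombination at the end are assumptions for illustration.

from typing import List

def split_by_max_tokens(compiled: str, max_tokens: int = 256, max_word_length: int = 500) -> List[str]:
    "Sketch: chunk a compiled entry string into pieces of at most max_tokens words."
    # Split only on spaces and drop empty strings, so newlines/tabs stay attached to their words
    compiled_entry_words = [word for word in compiled.split(" ") if word != ""]
    # Drop long words instead of having entry truncated to maintain quality of entry processed by models
    compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
    chunks: List[str] = []
    for chunk_index in range(0, len(compiled_entry_words), max_tokens):
        # Re-join with single spaces; any other whitespace is still embedded in the words themselves
        chunks.append(" ".join(compiled_entry_words[chunk_index : chunk_index + max_tokens]))
    return chunks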
@@ -44,7 +44,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
     # Arrange
     entry = f"""*** Heading
     \t\r
-    Body Line 1
+    Body Line
     """
     orgfile = create_file(tmp_path, entry)
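Applying the new splitting to the test fixture above (an illustration, not part of the test file) shows why the whitespace-only \t\r line matters: the old split() discards it entirely, while the split-by-space keeps it embedded in the surrounding words, so the entry round-trips with its formatting intact.

entry = "*** Heading\n\t\r\nBody Line\n"  # same content as the fixture above, minus the f-string indentation

words = [word for word in entry.split(" ") if word != ""]
print(words)                     # ['***', 'Heading\n\t\r\nBody', 'Line\n']
assert " ".join(words) == entry  # formatting, including the \t\r line, survives
print(entry.split())             # ['***', 'Heading', 'Body', 'Line'] -- old behaviour drops the whitespace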