mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Keep original formatting in compiled text entry strings
- Explicitly split entry string by space during split by max_tokens - Prevent formatting of compiled entry from being lost - The formatting itself contains useful information. No point in dropping the formatting unnecessarily, even if (say) the current search models don't account for it (yet)
This commit is contained in:
parent
a2ab68a7a2
commit
5673bd5b96
2 changed files with 2 additions and 2 deletions
|
@ -31,7 +31,7 @@ class TextToJsonl(ABC):
|
|||
"Split entries if compiled entry length exceeds the max tokens supported by the ML model."
|
||||
chunked_entries: List[Entry] = []
|
||||
for entry in entries:
|
||||
compiled_entry_words = entry.compiled.split()
|
||||
compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""]
|
||||
# Drop long words instead of having entry truncated to maintain quality of entry processed by models
|
||||
compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
|
||||
for chunk_index in range(0, len(compiled_entry_words), max_tokens):
|
||||
|
|
|
@ -44,7 +44,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
|
|||
# Arrange
|
||||
entry = f"""*** Heading
|
||||
\t\r
|
||||
Body Line 1
|
||||
Body Line
|
||||
"""
|
||||
orgfile = create_file(tmp_path, entry)
|
||||
|
||||
|
|
Loading…
Reference in a new issue