Keep original formatting in compiled text entry strings

- Explicitly split entry string by space during split by max_tokens
- Prevent formatting of compiled entry from being lost
- The formatting itself contains useful information
  No point in dropping the formatting unnecessarily,
  even if (say) the current search models don't account for it (yet)
This commit is contained in:
Debanjum Singh Solanky 2023-03-30 12:38:45 +07:00
parent a2ab68a7a2
commit 5673bd5b96
2 changed files with 2 additions and 2 deletions

View file

@ -31,7 +31,7 @@ class TextToJsonl(ABC):
"Split entries if compiled entry length exceeds the max tokens supported by the ML model."
chunked_entries: List[Entry] = []
for entry in entries:
compiled_entry_words = entry.compiled.split()
compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""]
# Drop long words instead of having entry truncated to maintain quality of entry processed by models
compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
for chunk_index in range(0, len(compiled_entry_words), max_tokens):

View file

@ -44,7 +44,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
# Arrange
entry = f"""*** Heading
\t\r
Body Line 1
Body Line
"""
orgfile = create_file(tmp_path, entry)