Keep original formatting in compiled text entry strings

- Explicitly split entry string by space during split by max_tokens
- Prevent formatting of compiled entry from being lost
- The formatting itself contains useful information
  No point in dropping the formatting unnecessarily,
  even if (say) the current search models don't account for it (yet)
This commit is contained in:
Debanjum Singh Solanky 2023-03-30 12:38:45 +07:00
parent a2ab68a7a2
commit 5673bd5b96
2 changed files with 2 additions and 2 deletions

View file

@ -31,7 +31,7 @@ class TextToJsonl(ABC):
"Split entries if compiled entry length exceeds the max tokens supported by the ML model." "Split entries if compiled entry length exceeds the max tokens supported by the ML model."
chunked_entries: List[Entry] = [] chunked_entries: List[Entry] = []
for entry in entries: for entry in entries:
compiled_entry_words = entry.compiled.split() compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""]
# Drop long words instead of having entry truncated to maintain quality of entry processed by models # Drop long words instead of having entry truncated to maintain quality of entry processed by models
compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length] compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
for chunk_index in range(0, len(compiled_entry_words), max_tokens): for chunk_index in range(0, len(compiled_entry_words), max_tokens):

View file

@ -44,7 +44,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
# Arrange # Arrange
entry = f"""*** Heading entry = f"""*** Heading
\t\r \t\r
Body Line 1 Body Line
""" """
orgfile = create_file(tmp_path, entry) orgfile = create_file(tmp_path, entry)