diff --git a/src/processor/text_to_jsonl.py b/src/processor/text_to_jsonl.py
index 0eb60e6c..4d88a612 100644
--- a/src/processor/text_to_jsonl.py
+++ b/src/processor/text_to_jsonl.py
@@ -24,11 +24,13 @@ class TextToJsonl(ABC):
         return lambda entry: hashlib.md5(bytes(getattr(entry, key), encoding='utf-8')).hexdigest()
 
     @staticmethod
-    def split_entries_by_max_tokens(entries: list[Entry], max_tokens: int=256) -> list[Entry]:
+    def split_entries_by_max_tokens(entries: list[Entry], max_tokens: int=256, max_word_length: int=500) -> list[Entry]:
         "Split entries if compiled entry length exceeds the max tokens supported by the ML model."
         chunked_entries: list[Entry] = []
         for entry in entries:
             compiled_entry_words = entry.compiled.split()
+            # Drop long words instead of truncating the entry, to maintain the quality of entries processed by the models
+            compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
             for chunk_index in range(0, len(compiled_entry_words), max_tokens):
                 compiled_entry_words_chunk = compiled_entry_words[chunk_index:chunk_index + max_tokens]
                 compiled_entry_chunk = ' '.join(compiled_entry_words_chunk)
diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py
index fe64cc67..3f30b7fc 100644
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@@ -5,6 +5,7 @@ import json
 from src.processor.org_mode.org_to_jsonl import OrgToJsonl
 from src.processor.text_to_jsonl import TextToJsonl
 from src.utils.helpers import is_none_or_empty
+from src.utils.rawconfig import Entry
 
 
 def test_configure_heading_entry_to_jsonl(tmp_path):
@@ -61,6 +62,24 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
     assert len(jsonl_data) == 2
 
 
+def test_entry_split_drops_large_words(tmp_path):
+    "Ensure words larger than the specified max word length are dropped from the compiled entry."
+    # Arrange
+    entry_text = '''*** Heading
+    \t\r
+    Body Line 1
+    '''
+    entry = Entry(raw=entry_text, compiled=entry_text)
+
+    # Act
+    # Split entry by max tokens and drop words longer than the max word length
+    processed_entry = TextToJsonl.split_entries_by_max_tokens([entry], max_word_length=5)[0]
+
+    # Assert
+    # "Heading" is dropped from the compiled version because it's over the set max word length
+    assert len(processed_entry.compiled.split()) == len(entry_text.split()) - 1
+
+
 def test_entry_with_body_to_jsonl(tmp_path):
     "Ensure entries with valid body text are loaded."
     # Arrange
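
A minimal standalone sketch of the behavior this diff introduces, for illustration only: it is not the repo's actual code, and split_compiled_text is a hypothetical stand-in for the filtering-and-chunking core of split_entries_by_max_tokens (without the Entry wrapper). Words longer than max_word_length are dropped first; the remaining words are then chunked into groups of at most max_tokens.

    # Hypothetical stand-in for the new filtering-and-chunking logic
    def split_compiled_text(compiled: str, max_tokens: int = 256, max_word_length: int = 500) -> list[str]:
        # Drop overly long words (e.g. base64 blobs, minified strings) to keep chunk quality high
        words = [word for word in compiled.split() if len(word) <= max_word_length]
        # Chunk the remaining words so each chunk fits the model's token budget
        return [' '.join(words[i:i + max_tokens]) for i in range(0, len(words), max_tokens)]

    # "Heading" (7 chars) is dropped with max_word_length=5; the rest is chunked in pairs
    print(split_compiled_text("*** Heading Body Line 1", max_tokens=2, max_word_length=5))
    # ['*** Body', 'Line 1']

This mirrors what the new test asserts: filtering happens before chunking, so a single oversized word shrinks the compiled word count by exactly one without truncating the rest of the entry.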