Update drop large words test to ensure newlines are considered word boundaries

Prevent regression to #620
2025-02-17 08:04:21 +00:00 · 2024-04-08 13:38:08 +05:30 · 2024-04-08 13:38:08 +05:30 · 9239c2c2ed
commit 9239c2c2ed
parent 67b1178aec
1 changed files with 13 additions and 7 deletions
--- a/tests/test_org_to_entries.py
+++ b/tests/test_org_to_entries.py
@ -68,10 +68,12 @@ def test_entry_split_when_exceeds_max_tokens():
 def test_entry_split_drops_large_words():
    "Ensure entries drops words larger than specified max word length from compiled version."
    # Arrange
-    entry_text = f"""*** Heading
+    entry_text = f"""First Line
-    \t\r
+dog=1\n\r\t
-    Body Line 1
+cat=10
-    """
+car=4
 book=2
 """
    entry = Entry(raw=entry_text, compiled=entry_text)
    # Act
@ -79,9 +81,13 @@ def test_entry_split_drops_large_words():
    processed_entry = TextToEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0]
    # Assert
-    # (Only) "Heading" dropped from compiled version because its over the set max word limit
+    # Ensure words larger than max word length are dropped
-    assert "Heading" not in processed_entry.compiled
+    # Ensure newline characters are considered as word boundaries for splitting words. See #620
-    assert len(processed_entry.compiled.split()) == len(entry_text.split()) - 1
+    words_to_keep = ["First", "Line", "dog=1", "car=4"]
    words_to_drop = ["cat=10", "book=2"]
    assert all([word for word in words_to_keep if word in processed_entry.compiled])
    assert not any([word for word in words_to_drop if word in processed_entry.compiled])
    assert len(processed_entry.compiled.split()) == len(entry_text.split()) - 2
 def test_parse_org_file_into_single_entry_if_small(tmp_path):