mirror of
https://github.com/khoj-ai/khoj.git
synced 2025-02-17 08:04:21 +00:00
Update drop large words test to ensure newlines are considered word boundaries
Prevent regression to #620
This commit is contained in:
parent
67b1178aec
commit
9239c2c2ed
1 changed files with 13 additions and 7 deletions
|
@ -68,10 +68,12 @@ def test_entry_split_when_exceeds_max_tokens():
|
||||||
def test_entry_split_drops_large_words():
|
def test_entry_split_drops_large_words():
|
||||||
"Ensure entries drops words larger than specified max word length from compiled version."
|
"Ensure entries drops words larger than specified max word length from compiled version."
|
||||||
# Arrange
|
# Arrange
|
||||||
entry_text = f"""*** Heading
|
entry_text = f"""First Line
|
||||||
\t\r
|
dog=1\n\r\t
|
||||||
Body Line 1
|
cat=10
|
||||||
"""
|
car=4
|
||||||
|
book=2
|
||||||
|
"""
|
||||||
entry = Entry(raw=entry_text, compiled=entry_text)
|
entry = Entry(raw=entry_text, compiled=entry_text)
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
|
@ -79,9 +81,13 @@ def test_entry_split_drops_large_words():
|
||||||
processed_entry = TextToEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0]
|
processed_entry = TextToEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0]
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
# (Only) "Heading" dropped from compiled version because it's over the set max word limit
|
# Ensure words larger than max word length are dropped
|
||||||
assert "Heading" not in processed_entry.compiled
|
# Ensure newline characters are considered as word boundaries for splitting words. See #620
|
||||||
assert len(processed_entry.compiled.split()) == len(entry_text.split()) - 1
|
words_to_keep = ["First", "Line", "dog=1", "car=4"]
|
||||||
|
words_to_drop = ["cat=10", "book=2"]
|
||||||
|
assert all([word for word in words_to_keep if word in processed_entry.compiled])
|
||||||
|
assert not any([word for word in words_to_drop if word in processed_entry.compiled])
|
||||||
|
assert len(processed_entry.compiled.split()) == len(entry_text.split()) - 2
|
||||||
|
|
||||||
|
|
||||||
def test_parse_org_file_into_single_entry_if_small(tmp_path):
|
def test_parse_org_file_into_single_entry_if_small(tmp_path):
|
||||||
|
|
Loading…
Add table
Reference in a new issue