Mirror of https://github.com/khoj-ai/khoj.git (synced 2024-11-28 01:45:07 +01:00)
Drop long words from compiled entries to be within max token limit of models

Long words (>500 characters) provide less useful context to models. Dropping very long words allows models to create better embeddings by passing more of the useful context from the entry to the model.
This commit is contained in:
parent 6a30a13326
commit 826f9dc054

2 changed files with 22 additions and 1 deletion
@@ -24,11 +24,13 @@ class TextToJsonl(ABC):
         return lambda entry: hashlib.md5(bytes(getattr(entry, key), encoding='utf-8')).hexdigest()
 
     @staticmethod
-    def split_entries_by_max_tokens(entries: list[Entry], max_tokens: int=256) -> list[Entry]:
+    def split_entries_by_max_tokens(entries: list[Entry], max_tokens: int=256, max_word_length: int=500) -> list[Entry]:
         "Split entries if compiled entry length exceeds the max tokens supported by the ML model."
         chunked_entries: list[Entry] = []
         for entry in entries:
             compiled_entry_words = entry.compiled.split()
+            # Drop long words instead of having entry truncated to maintain quality of entry processed by models
+            compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
             for chunk_index in range(0, len(compiled_entry_words), max_tokens):
                 compiled_entry_words_chunk = compiled_entry_words[chunk_index:chunk_index + max_tokens]
                 compiled_entry_chunk = ' '.join(compiled_entry_words_chunk)
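For readers skimming the hunk above, here is a minimal, self-contained sketch of the same filter-then-chunk logic. The Entry dataclass below is a stand-in for khoj's src.utils.rawconfig.Entry (assumed to carry raw and compiled strings, as the test file's usage suggests), and the example text and the part that rebuilds Entry objects from chunks are illustrative, not taken verbatim from the commit:

from dataclasses import dataclass

@dataclass
class Entry:
    raw: str
    compiled: str

def split_entries_by_max_tokens(entries, max_tokens=256, max_word_length=500):
    "Drop overlong words, then chunk each compiled entry into max_tokens-word pieces."
    chunked_entries = []
    for entry in entries:
        # Words over max_word_length characters (e.g. base64 blobs or very long URLs)
        # carry little semantic signal; dropping them keeps more useful words in budget
        words = [word for word in entry.compiled.split() if len(word) <= max_word_length]
        for chunk_index in range(0, len(words), max_tokens):
            compiled_chunk = ' '.join(words[chunk_index:chunk_index + max_tokens])
            chunked_entries.append(Entry(raw=entry.raw, compiled=compiled_chunk))
    return chunked_entries

# A 600-character pseudo-word is dropped; the surrounding context is preserved
noisy = Entry(raw='', compiled='useful context ' + 'x' * 600 + ' more useful context')
print(split_entries_by_max_tokens([noisy])[0].compiled)
# -> 'useful context more useful context'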
@@ -5,6 +5,7 @@ import json
 from src.processor.org_mode.org_to_jsonl import OrgToJsonl
 from src.processor.text_to_jsonl import TextToJsonl
 from src.utils.helpers import is_none_or_empty
+from src.utils.rawconfig import Entry
 
 
 def test_configure_heading_entry_to_jsonl(tmp_path):
@@ -61,6 +62,24 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
     assert len(jsonl_data) == 2
 
 
+def test_entry_split_drops_large_words(tmp_path):
+    "Ensure entries drop words larger than the specified max word length from the compiled version."
+    # Arrange
+    entry_text = f'''*** Heading
+    \t\r
+    Body Line 1
+    '''
+    entry = Entry(raw=entry_text, compiled=entry_text)
+
+    # Act
+    # Split entry by max words and drop words larger than max word length
+    processed_entry = TextToJsonl.split_entries_by_max_tokens([entry], max_word_length=5)[0]
+
+    # Assert
+    # "Heading" dropped from compiled version because it's over the set max word limit
+    assert len(processed_entry.compiled.split()) == len(entry_text.split()) - 1
+
+
 def test_entry_with_body_to_jsonl(tmp_path):
     "Ensure entries with valid body text are loaded."
     # Arrange
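As a worked check of the new test's assertion: entry_text splits into five whitespace-delimited words ('***', 'Heading', 'Body', 'Line', '1'), and only 'Heading' (7 characters) exceeds max_word_length=5, so exactly one word is dropped. A standalone sketch of that arithmetic:

entry_text = '''*** Heading
\t\r
Body Line 1
'''
words = entry_text.split()                # ['***', 'Heading', 'Body', 'Line', '1']
kept = [w for w in words if len(w) <= 5]  # only 'Heading' is longer than 5 characters
assert len(kept) == len(words) - 1        # mirrors the test's expectation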