mirror of
https://github.com/khoj-ai/khoj.git
synced 2025-01-05 11:08:08 +00:00
Split entries by max tokens while converting Org entries to JSONL
- Test usage of entry splitting by max tokens in text search
This commit is contained in:
parent
e057c8e208
commit
c79919bd68
2 changed files with 38 additions and 0 deletions
|
@ -43,6 +43,11 @@ class OrgToJsonl(TextToJsonl):
|
||||||
end = time.time()
|
end = time.time()
|
||||||
logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds")
|
logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds")
|
||||||
|
|
||||||
|
start = time.time()
|
||||||
|
current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
|
||||||
|
end = time.time()
|
||||||
|
logger.debug(f"Split entries by max token size supported by model: {end - start} seconds")
|
||||||
|
|
||||||
# Identify, mark and merge any new entries with previous entries
|
# Identify, mark and merge any new entries with previous entries
|
||||||
if not previous_entries:
|
if not previous_entries:
|
||||||
entries_with_ids = list(enumerate(current_entries))
|
entries_with_ids = list(enumerate(current_entries))
|
||||||
|
|
|
@ -80,6 +80,39 @@ def test_asymmetric_search(content_config: ContentConfig, search_config: SearchC
|
||||||
assert "git clone" in search_result
|
assert "git clone" in search_result
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------------------------------
|
||||||
|
def test_entry_chunking_by_max_tokens(content_config: ContentConfig, search_config: SearchConfig):
    """Check that an Org entry longer than the max token limit is split when indexed.

    Sets up the text search model on the configured Org content, adds a new Org
    file holding a single entry with more than ``max_tokens`` words, sets up the
    model again, and asserts that the entry and embedding counts grow from 10 to 12.

    The temporary Org file and the ``input_files`` override are undone in a
    ``finally`` block, so a failing assertion does not leave the file on disk
    or the config object mutated.
    """
    # Arrange
    initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)

    assert len(initial_notes_model.entries) == 10
    assert len(initial_notes_model.corpus_embeddings) == 10

    file_to_add_on_reload = Path(content_config.org.input_filter[0]).parent / "entry_exceeding_max_tokens.org"

    try:
        content_config.org.input_files = [f"{file_to_add_on_reload}"]

        # Write an Org-Mode entry whose body exceeds the max token limit to the new Org file
        max_tokens = 256
        with open(file_to_add_on_reload, "w", encoding="utf-8") as f:
            f.write(f"* Entry more than {max_tokens} words\n")
            # max_tokens + 1 space-separated words, written in one call
            f.write("".join(f"{index} " for index in range(max_tokens + 1)))

        # Act
        # reload embeddings, entries, notes model after adding new org-mode file
        reloaded_notes_model = text_search.setup(
            OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False
        )

        # Assert
        # verify newly added org-mode entry is split by max tokens
        assert len(reloaded_notes_model.entries) == 12
        assert len(reloaded_notes_model.corpus_embeddings) == 12
    finally:
        # Cleanup
        # reset the config override and delete the reload test file, even if an assertion failed
        content_config.org.input_files = []
        if file_to_add_on_reload.exists():
            file_to_add_on_reload.unlink()
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchConfig):
|
def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchConfig):
|
||||||
# Arrange
|
# Arrange
|
||||||
|
|
Loading…
Reference in a new issue