mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Fix comments, use minimal test case, regenerate test index, merge debug logs
- Remove property drawer from test entry for max_words splitting test
  - Property drawer is not required for the test
  - Keep minimal test case to reduce chance for confusion
This commit is contained in:
parent
b283650991
commit
24676f95d8
3 changed files with 7 additions and 10 deletions
|
@ -150,16 +150,16 @@ def query(raw_query: str, model: TextSearchModel, rank_results=False):
|
||||||
end = time.time()
|
end = time.time()
|
||||||
logger.debug(f"Rank Time: {end - start:.3f} seconds on device: {state.device}")
|
logger.debug(f"Rank Time: {end - start:.3f} seconds on device: {state.device}")
|
||||||
|
|
||||||
# Deduplicate entries by raw entry text
|
# Deduplicate entries by raw entry text before showing to users
|
||||||
# Required because entries are split by max_word count supported by ML model. This results in duplicate hits, entries
|
# Compiled entries are split by max tokens supported by ML models.
|
||||||
|
# This can result in duplicate hits, entries shown to user.
|
||||||
start = time.time()
|
start = time.time()
|
||||||
seen, original_hits_count = set(), len(hits)
|
seen, original_hits_count = set(), len(hits)
|
||||||
hits = [hit for hit in hits
|
hits = [hit for hit in hits
|
||||||
if entries[hit['corpus_id']].raw not in seen and not seen.add(entries[hit['corpus_id']].raw)]
|
if entries[hit['corpus_id']].raw not in seen and not seen.add(entries[hit['corpus_id']].raw)]
|
||||||
duplicate_hits = original_hits_count - len(hits)
|
duplicate_hits = original_hits_count - len(hits)
|
||||||
end = time.time()
|
end = time.time()
|
||||||
logger.debug(f"Removed {duplicate_hits} Duplicate Hits")
|
logger.debug(f"Deduplication Time: {end - start:.3f} seconds. Removed {duplicate_hits} duplicates")
|
||||||
logger.debug(f"Deduplication Time: {end - start:.3f} seconds")
|
|
||||||
|
|
||||||
return hits, entries
|
return hits, entries
|
||||||
|
|
||||||
|
|
|
@ -40,9 +40,6 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
|
||||||
"Ensure entries with compiled words exceeding max_words are split."
|
"Ensure entries with compiled words exceeding max_words are split."
|
||||||
# Arrange
|
# Arrange
|
||||||
entry = f'''*** Heading
|
entry = f'''*** Heading
|
||||||
:PROPERTIES:
|
|
||||||
:ID: 42-42-42
|
|
||||||
:END:
|
|
||||||
\t\r
|
\t\r
|
||||||
Body Line 1
|
Body Line 1
|
||||||
'''
|
'''
|
||||||
|
@ -52,7 +49,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
|
||||||
# Extract Entries from specified Org files
|
# Extract Entries from specified Org files
|
||||||
entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])
|
entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])
|
||||||
|
|
||||||
# Split Each Entry from specified Org files by Max Words
|
# Split each entry from specified Org files by max words
|
||||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
|
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
|
||||||
TextToJsonl.split_entries_by_max_tokens(
|
TextToJsonl.split_entries_by_max_tokens(
|
||||||
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map),
|
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map),
|
||||||
|
|
|
@ -83,7 +83,7 @@ def test_asymmetric_search(content_config: ContentConfig, search_config: SearchC
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
def test_entry_chunking_by_max_tokens(content_config: ContentConfig, search_config: SearchConfig):
|
def test_entry_chunking_by_max_tokens(content_config: ContentConfig, search_config: SearchConfig):
|
||||||
# Arrange
|
# Arrange
|
||||||
initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
|
||||||
|
|
||||||
assert len(initial_notes_model.entries) == 10
|
assert len(initial_notes_model.entries) == 10
|
||||||
assert len(initial_notes_model.corpus_embeddings) == 10
|
assert len(initial_notes_model.corpus_embeddings) == 10
|
||||||
|
@ -91,7 +91,7 @@ def test_entry_chunking_by_max_tokens(content_config: ContentConfig, search_conf
|
||||||
file_to_add_on_reload = Path(content_config.org.input_filter[0]).parent / "entry_exceeding_max_tokens.org"
|
file_to_add_on_reload = Path(content_config.org.input_filter[0]).parent / "entry_exceeding_max_tokens.org"
|
||||||
content_config.org.input_files = [f'{file_to_add_on_reload}']
|
content_config.org.input_files = [f'{file_to_add_on_reload}']
|
||||||
|
|
||||||
# Append Org-Mode Entry with size exceeding max token limit to new Org File in Config
|
# Insert org-mode entry with size exceeding max token limit to new org file
|
||||||
max_tokens = 256
|
max_tokens = 256
|
||||||
with open(file_to_add_on_reload, "w") as f:
|
with open(file_to_add_on_reload, "w") as f:
|
||||||
f.write(f"* Entry more than {max_tokens} words\n")
|
f.write(f"* Entry more than {max_tokens} words\n")
|
||||||
|
|
Loading…
Reference in a new issue