mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-28 01:45:07 +01:00
Stable sort new entries when marking entries for update
This commit is contained in:
parent
7669b85da6
commit
6a0297cc86
1 changed file with 10 additions and 3 deletions
|
@@ -78,16 +78,23 @@ class TextToJsonl(ABC):
|
||||||
# All entries that exist in both current and previous sets are kept
|
# All entries that exist in both current and previous sets are kept
|
||||||
existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes)
|
existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes)
|
||||||
|
|
||||||
|
# load new entries in the order in which they are processed for a stable sort
|
||||||
|
new_entries = [
|
||||||
|
(current_entry_hashes.index(entry_hash), hash_to_current_entries[entry_hash])
|
||||||
|
for entry_hash in new_entry_hashes
|
||||||
|
]
|
||||||
|
new_entries_sorted = sorted(new_entries, key=lambda e: e[0])
|
||||||
# Mark new entries with -1 id to flag for later embeddings generation
|
# Mark new entries with -1 id to flag for later embeddings generation
|
||||||
new_entries = [(-1, hash_to_current_entries[entry_hash]) for entry_hash in new_entry_hashes]
|
new_entries_sorted = [(-1, entry[1]) for entry in new_entries_sorted]
|
||||||
|
|
||||||
# Set id of existing entries to their previous ids to reuse their existing encoded embeddings
|
# Set id of existing entries to their previous ids to reuse their existing encoded embeddings
|
||||||
existing_entries = [
|
existing_entries = [
|
||||||
(previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash])
|
(previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash])
|
||||||
for entry_hash in existing_entry_hashes
|
for entry_hash in existing_entry_hashes
|
||||||
]
|
]
|
||||||
|
|
||||||
existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0])
|
existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0])
|
||||||
entries_with_ids = existing_entries_sorted + new_entries
|
|
||||||
|
entries_with_ids = existing_entries_sorted + new_entries_sorted
|
||||||
|
|
||||||
return entries_with_ids
|
return entries_with_ids
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue