Stable sort new entries when marking entries for update

This commit is contained in:
Debanjum Singh Solanky 2023-07-15 23:58:13 -07:00
parent 7669b85da6
commit 6a0297cc86

View file

@@ -78,16 +78,23 @@ class TextToJsonl(ABC):
# All entries that exist in both current and previous sets are kept # All entries that exist in both current and previous sets are kept
existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes) existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes)
# load new entries in the order in which they are processed for a stable sort
new_entries = [
(current_entry_hashes.index(entry_hash), hash_to_current_entries[entry_hash])
for entry_hash in new_entry_hashes
]
new_entries_sorted = sorted(new_entries, key=lambda e: e[0])
# Mark new entries with -1 id to flag for later embeddings generation # Mark new entries with -1 id to flag for later embeddings generation
new_entries = [(-1, hash_to_current_entries[entry_hash]) for entry_hash in new_entry_hashes] new_entries_sorted = [(-1, entry[1]) for entry in new_entries_sorted]
# Set id of existing entries to their previous ids to reuse their existing encoded embeddings # Set id of existing entries to their previous ids to reuse their existing encoded embeddings
existing_entries = [ existing_entries = [
(previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash]) (previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash])
for entry_hash in existing_entry_hashes for entry_hash in existing_entry_hashes
] ]
existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0]) existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0])
entries_with_ids = existing_entries_sorted + new_entries
entries_with_ids = existing_entries_sorted + new_entries_sorted
return entries_with_ids return entries_with_ids