Use hashed value to improve deduplication of search results on server

This commit is contained in:
Debanjum Singh Solanky 2024-06-16 16:22:16 +05:30
parent 6814dadd21
commit 2930b57c78

View file

@@ -132,11 +132,13 @@ async def query(
def collate_results(hits, dedupe=True): def collate_results(hits, dedupe=True):
hit_ids = set() hit_ids = set()
hit_hashes = set()
for hit in hits: for hit in hits:
if dedupe and hit.corpus_id in hit_ids: if dedupe and (hit.hashed_value in hit_hashes or hit.corpus_id in hit_ids):
continue continue
else: else:
hit_hashes.add(hit.hashed_value)
hit_ids.add(hit.corpus_id) hit_ids.add(hit.corpus_id)
yield SearchResponse.model_validate( yield SearchResponse.model_validate(
{ {