Deduplicate results for user query by raw text before returning results

- Required because entries are now split by the max_word count supported by the ML models
- This can now result in duplicate hits, so the same entry may be returned to the user more than once
- Deduplicate after ranking so that the top-ranked deduplicated results are returned
2024-11-23 23:48:56 +01:00 · 2022-12-23 19:01:39 -03:00 · 2022-12-23 19:01:39 -03:00 · b283650991
commit b283650991
parent 53cd2e5605
1 changed files with 11 additions and 0 deletions
--- a/src/search_type/text_search.py
+++ b/src/search_type/text_search.py
@@ -150,6 +150,17 @@ def query(raw_query: str, model: TextSearchModel, rank_results=False):
    end = time.time()
    logger.debug(f"Rank Time: {end - start:.3f} seconds on device: {state.device}")

+    # Deduplicate entries by raw entry text
+    # Required because entries are split by the max_word count supported by the ML model. This can result in duplicate hits for the same raw entry
+    start = time.time()
+    seen, original_hits_count = set(), len(hits)
+    hits = [hit for hit in hits
+            if entries[hit['corpus_id']].raw not in seen and not seen.add(entries[hit['corpus_id']].raw)]
+    duplicate_hits = original_hits_count - len(hits)
+    end = time.time()
+    logger.debug(f"Removed {duplicate_hits} Duplicate Hits")
+    logger.debug(f"Deduplication Time: {end - start:.3f} seconds")
+
    return hits, entries