From 7468f6a6ed26215f44b0fd1906d698da301d921c Mon Sep 17 00:00:00 2001 From: Debanjum Date: Fri, 8 Nov 2024 14:49:09 -0800 Subject: [PATCH] Deduplicate online references returned by chat API to clients This will ensure only unique online references are shown in all clients. The duplication issue was exacerbated in research mode as even with different online search queries, you can get previously seen results. This change does a global deduplication across all online results seen across research iterations before returning them in client response. --- src/khoj/processor/tools/online_search.py | 22 ++++++++++++++++++++++ src/khoj/routers/api_chat.py | 9 +++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index d2d8c685..2ee6e72c 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -367,3 +367,25 @@ async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dic for item in response_json["data"] ] return query, {"organic": parsed_response} + + +def deduplicate_organic_results(online_results: dict) -> dict: + """Deduplicate organic search results based on links across all queries.""" + # Keep track of seen links to filter out duplicates across queries + seen_links = set() + deduplicated_results = {} + + # Process each query's results + for query, results in online_results.items(): + # Filter organic results keeping only first occurrence of each link + filtered_organic = [] + for result in results.get("organic", []): + link = result.get("link") + if link and link not in seen_links: + seen_links.add(link) + filtered_organic.append(result) + + # Update results with deduplicated organic entries + deduplicated_results[query] = {**results, "organic": filtered_organic} + + return deduplicated_results diff --git a/src/khoj/routers/api_chat.py b/src/khoj/routers/api_chat.py index c30f4cf8..648fb8dd 100644 
--- a/src/khoj/routers/api_chat.py +++ b/src/khoj/routers/api_chat.py @@ -28,7 +28,11 @@ from khoj.processor.conversation.prompts import help_message, no_entries_found from khoj.processor.conversation.utils import defilter_query, save_to_conversation_log from khoj.processor.image.generate import text_to_image from khoj.processor.speech.text_to_speech import generate_text_to_speech -from khoj.processor.tools.online_search import read_webpages, search_online +from khoj.processor.tools.online_search import ( + deduplicate_organic_results, + read_webpages, + search_online, +) from khoj.processor.tools.run_code import run_code from khoj.routers.api import extract_references_and_questions from khoj.routers.email import send_query_feedback @@ -1026,12 +1030,13 @@ async def chat( ) ## Send Gathered References + unique_online_results = deduplicate_organic_results(online_results) async for result in send_event( ChatEvent.REFERENCES, { "inferredQueries": inferred_queries, "context": compiled_references, - "onlineContext": online_results, + "onlineContext": unique_online_results, "codeContext": code_results, }, ):