def deduplicate_organic_results(online_results: dict) -> dict:
    """Return a shallow copy of the online results with repeated organic links removed.

    Links are tracked across all queries, so a link returned for several
    queries is kept only under the first query that produced it (following
    the iteration order of ``online_results``). Organic entries whose
    ``link`` is missing or empty are dropped. Every other key of a query's
    results is carried over as is, and each query in the output always has
    an ``organic`` list, even when it ends up empty.

    The input mapping and its ``organic`` lists are left unmodified.
    """
    visited_urls = set()

    def _unseen(entries):
        # Lazily yield entries whose link has not been encountered before,
        # recording each new link as it is emitted
        for entry in entries:
            url = entry.get("link")
            if not url or url in visited_urls:
                continue
            visited_urls.add(url)
            yield entry

    unique_results = {}
    for search_query, query_results in online_results.items():
        # Rebuild this query's results, overriding only the organic entries
        unique_results[search_query] = {
            **query_results,
            "organic": list(_unseen(query_results.get("organic", []))),
        }
    return unique_results
import send_query_feedback @@ -1026,12 +1030,13 @@ async def chat( ) ## Send Gathered References + unique_online_results = deduplicate_organic_results(online_results) async for result in send_event( ChatEvent.REFERENCES, { "inferredQueries": inferred_queries, "context": compiled_references, - "onlineContext": online_results, + "onlineContext": unique_online_results, "codeContext": code_results, }, ):