Deduplicate online references returned by chat API to clients

This will ensure only unique online references are shown in all
clients.

The duplication issue was exacerbated in research mode, as even
different online search queries can return previously seen results.

This change does a global deduplication across all online results seen
across research iterations before returning them in the client response.
This commit is contained in:
Debanjum 2024-11-08 14:49:09 -08:00
parent 137687ee49
commit 7468f6a6ed
2 changed files with 29 additions and 2 deletions

View file

@ -367,3 +367,25 @@ async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dic
for item in response_json["data"] for item in response_json["data"]
] ]
return query, {"organic": parsed_response} return query, {"organic": parsed_response}
def deduplicate_organic_results(online_results: dict) -> dict:
    """Deduplicate organic search results based on links across all queries.

    Queries are walked in the mapping's iteration order and a single set of
    seen links is shared between them, so a link is kept only under the first
    query that returned it.

    Args:
        online_results: Mapping of query -> result dict. Each result dict may
            hold an "organic" list of result dicts, each identified by its
            "link" value. ``None`` or an empty mapping is treated as "no
            results".

    Returns:
        A new mapping with the same queries. Every per-query dict is a shallow
        copy of the original with its "organic" list replaced by the filtered
        list (an empty list when the query had no organic results). The input
        mapping and its lists are not mutated.
    """
    # Links already emitted for an earlier query (or earlier in the same query)
    seen_links = set()
    deduplicated_results = {}

    # `or {}` lets callers pass None when no online search was run
    for query, results in (online_results or {}).items():
        filtered_organic = []
        # `or []` tolerates an explicit `"organic": None` as well as a missing key
        for result in results.get("organic") or []:
            link = result.get("link")
            # Entries without a link cannot be keyed for deduplication and are
            # dropped, as are entries whose link was already seen.
            if not link or link in seen_links:
                continue
            seen_links.add(link)
            filtered_organic.append(result)

        # Keep every other key of the query's results untouched
        deduplicated_results[query] = {**results, "organic": filtered_organic}

    return deduplicated_results

View file

@ -28,7 +28,11 @@ from khoj.processor.conversation.prompts import help_message, no_entries_found
from khoj.processor.conversation.utils import defilter_query, save_to_conversation_log from khoj.processor.conversation.utils import defilter_query, save_to_conversation_log
from khoj.processor.image.generate import text_to_image from khoj.processor.image.generate import text_to_image
from khoj.processor.speech.text_to_speech import generate_text_to_speech from khoj.processor.speech.text_to_speech import generate_text_to_speech
from khoj.processor.tools.online_search import read_webpages, search_online from khoj.processor.tools.online_search import (
deduplicate_organic_results,
read_webpages,
search_online,
)
from khoj.processor.tools.run_code import run_code from khoj.processor.tools.run_code import run_code
from khoj.routers.api import extract_references_and_questions from khoj.routers.api import extract_references_and_questions
from khoj.routers.email import send_query_feedback from khoj.routers.email import send_query_feedback
@ -1026,12 +1030,13 @@ async def chat(
) )
## Send Gathered References ## Send Gathered References
unique_online_results = deduplicate_organic_results(online_results)
async for result in send_event( async for result in send_event(
ChatEvent.REFERENCES, ChatEvent.REFERENCES,
{ {
"inferredQueries": inferred_queries, "inferredQueries": inferred_queries,
"context": compiled_references, "context": compiled_references,
"onlineContext": online_results, "onlineContext": unique_online_results,
"codeContext": code_results, "codeContext": code_results,
}, },
): ):