mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-12-03 12:23:02 +01:00
Deduplicate online references returned by chat API to clients
This will ensure only unique online references are shown in all clients. The duplication issue was exacerbated in research mode because, even with different online search queries, you can get previously seen results. This change does a global deduplication across all online results seen across research iterations before returning them in the client response.
This commit is contained in:
parent
137687ee49
commit
7468f6a6ed
2 changed files with 29 additions and 2 deletions
|
@ -367,3 +367,25 @@ async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dic
|
||||||
for item in response_json["data"]
|
for item in response_json["data"]
|
||||||
]
|
]
|
||||||
return query, {"organic": parsed_response}
|
return query, {"organic": parsed_response}
|
||||||
|
|
||||||
|
|
||||||
|
def deduplicate_organic_results(online_results: dict) -> dict:
    """Drop repeated organic search hits, identified by link, across every query.

    Queries are visited in the mapping's iteration order and the first hit
    seen for a link wins. Later hits carrying the same link are left out, as
    are hits whose link is missing or empty. Every other field of a query's
    results is carried over unchanged, and each returned entry always has an
    "organic" list (possibly empty). The input mapping is not modified.
    """
    # Links already emitted for any query; shared so dedup is global, not per query
    links_already_kept = set()

    def is_first_sighting(hit) -> bool:
        # Record the link as a side effect so later hits with it are rejected
        url = hit.get("link")
        if not url or url in links_already_kept:
            return False
        links_already_kept.add(url)
        return True

    unique_results = {}
    for search_query, query_results in online_results.items():
        fresh_hits = [hit for hit in query_results.get("organic", []) if is_first_sighting(hit)]
        # Copy the non-organic fields as-is and swap in the filtered organic list
        unique_results[search_query] = {**query_results, "organic": fresh_hits}
    return unique_results
|
||||||
|
|
|
@ -28,7 +28,11 @@ from khoj.processor.conversation.prompts import help_message, no_entries_found
|
||||||
from khoj.processor.conversation.utils import defilter_query, save_to_conversation_log
|
from khoj.processor.conversation.utils import defilter_query, save_to_conversation_log
|
||||||
from khoj.processor.image.generate import text_to_image
|
from khoj.processor.image.generate import text_to_image
|
||||||
from khoj.processor.speech.text_to_speech import generate_text_to_speech
|
from khoj.processor.speech.text_to_speech import generate_text_to_speech
|
||||||
from khoj.processor.tools.online_search import read_webpages, search_online
|
from khoj.processor.tools.online_search import (
|
||||||
|
deduplicate_organic_results,
|
||||||
|
read_webpages,
|
||||||
|
search_online,
|
||||||
|
)
|
||||||
from khoj.processor.tools.run_code import run_code
|
from khoj.processor.tools.run_code import run_code
|
||||||
from khoj.routers.api import extract_references_and_questions
|
from khoj.routers.api import extract_references_and_questions
|
||||||
from khoj.routers.email import send_query_feedback
|
from khoj.routers.email import send_query_feedback
|
||||||
|
@ -1026,12 +1030,13 @@ async def chat(
|
||||||
)
|
)
|
||||||
|
|
||||||
## Send Gathered References
|
## Send Gathered References
|
||||||
|
unique_online_results = deduplicate_organic_results(online_results)
|
||||||
async for result in send_event(
|
async for result in send_event(
|
||||||
ChatEvent.REFERENCES,
|
ChatEvent.REFERENCES,
|
||||||
{
|
{
|
||||||
"inferredQueries": inferred_queries,
|
"inferredQueries": inferred_queries,
|
||||||
"context": compiled_references,
|
"context": compiled_references,
|
||||||
"onlineContext": online_results,
|
"onlineContext": unique_online_results,
|
||||||
"codeContext": code_results,
|
"codeContext": code_results,
|
||||||
},
|
},
|
||||||
):
|
):
|
||||||
|
|
Loading…
Reference in a new issue