Only keep webpage content requested, even if Jina API gets more data

Jina search API returns content of all webpages in search results.
Previously, the code wouldn't remove content beyond the max_webpages_to_read
limit set. Now, webpage content in organic results is explicitly
removed beyond the requested max_webpages_to_read limit.

This should align the behavior of online results from Jina with other
online search providers, and restrict the LLM context to a reasonable size
when using Jina for online search.
This commit is contained in:
Debanjum 2024-11-10 13:19:24 -08:00
parent 8ef7892c5e
commit eb492f3025

View file

@ -95,17 +95,21 @@ async def search_online(
response_dict = {subquery: search_result for subquery, search_result in search_results} response_dict = {subquery: search_result for subquery, search_result in search_results}
# Gather distinct web pages from organic results for subqueries without an instant answer. # Gather distinct web pages from organic results for subqueries without an instant answer.
# Content of web pages is directly available when Jina is used for search.
webpages: Dict[str, Dict] = {} webpages: Dict[str, Dict] = {}
for subquery in response_dict: for subquery in response_dict:
if "answerBox" in response_dict[subquery]: if "answerBox" in response_dict[subquery]:
continue continue
for organic in response_dict[subquery].get("organic", [])[:max_webpages_to_read]: for idx, organic in enumerate(response_dict[subquery].get("organic", [])):
link = organic.get("link") link = organic.get("link")
if link in webpages: if link in webpages and idx < max_webpages_to_read:
webpages[link]["queries"].add(subquery) webpages[link]["queries"].add(subquery)
else: # Content of web pages is directly available when Jina is used for search.
elif idx < max_webpages_to_read:
webpages[link] = {"queries": {subquery}, "content": organic.get("content")} webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
# Only keep webpage content for up to max_webpages_to_read organic results.
if idx >= max_webpages_to_read and not is_none_or_empty(organic.get("content")):
organic["content"] = None
response_dict[subquery]["organic"][idx] = organic
# Read, extract relevant info from the retrieved web pages # Read, extract relevant info from the retrieved web pages
if webpages: if webpages: