mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 17:35:07 +01:00
Only keep webpage content requested, even if Jina API gets more data
The Jina search API returns the content of all webpages in the search results. Previously, the code wouldn't remove content beyond the max_webpages_to_read limit that was set. Now, webpage content in organic results is explicitly removed beyond the requested max_webpages_to_read limit. This should align the behavior of online results from Jina with other online search providers, and restrict the LLM context to a reasonable size when using Jina for online search.
This commit is contained in:
parent
8ef7892c5e
commit
eb492f3025
1 changed files with 8 additions and 4 deletions
|
@ -95,17 +95,21 @@ async def search_online(
|
||||||
response_dict = {subquery: search_result for subquery, search_result in search_results}
|
response_dict = {subquery: search_result for subquery, search_result in search_results}
|
||||||
|
|
||||||
# Gather distinct web pages from organic results for subqueries without an instant answer.
|
# Gather distinct web pages from organic results for subqueries without an instant answer.
|
||||||
# Content of web pages is directly available when Jina is used for search.
|
|
||||||
webpages: Dict[str, Dict] = {}
|
webpages: Dict[str, Dict] = {}
|
||||||
for subquery in response_dict:
|
for subquery in response_dict:
|
||||||
if "answerBox" in response_dict[subquery]:
|
if "answerBox" in response_dict[subquery]:
|
||||||
continue
|
continue
|
||||||
for organic in response_dict[subquery].get("organic", [])[:max_webpages_to_read]:
|
for idx, organic in enumerate(response_dict[subquery].get("organic", [])):
|
||||||
link = organic.get("link")
|
link = organic.get("link")
|
||||||
if link in webpages:
|
if link in webpages and idx < max_webpages_to_read:
|
||||||
webpages[link]["queries"].add(subquery)
|
webpages[link]["queries"].add(subquery)
|
||||||
else:
|
# Content of web pages is directly available when Jina is used for search.
|
||||||
|
elif idx < max_webpages_to_read:
|
||||||
webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
|
webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
|
||||||
|
# Only keep webpage content for up to max_webpages_to_read organic results.
|
||||||
|
if idx >= max_webpages_to_read and not is_none_or_empty(organic.get("content")):
|
||||||
|
organic["content"] = None
|
||||||
|
response_dict[subquery]["organic"][idx] = organic
|
||||||
|
|
||||||
# Read, extract relevant info from the retrieved web pages
|
# Read, extract relevant info from the retrieved web pages
|
||||||
if webpages:
|
if webpages:
|
||||||
|
|
Loading…
Reference in a new issue