mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 15:38:55 +01:00
Only keep webpage content requested, even if Jina API gets more data
Jina search API returns content of all webpages in search results. Previously, the code wouldn't remove content beyond the max_webpages_to_read limit set. Now, webpage content in organic results is explicitly removed beyond the requested max_webpages_to_read limit. This should align the behavior of online results from Jina with other online search providers. And restrict LLM context to a reasonable size when using Jina for online search.
This commit is contained in:
parent
8ef7892c5e
commit
eb492f3025
1 changed files with 8 additions and 4 deletions
|
@ -95,17 +95,21 @@ async def search_online(
|
|||
response_dict = {subquery: search_result for subquery, search_result in search_results}
|
||||
|
||||
# Gather distinct web pages from organic results for subqueries without an instant answer.
|
||||
# Content of web pages is directly available when Jina is used for search.
|
||||
webpages: Dict[str, Dict] = {}
|
||||
for subquery in response_dict:
|
||||
if "answerBox" in response_dict[subquery]:
|
||||
continue
|
||||
for organic in response_dict[subquery].get("organic", [])[:max_webpages_to_read]:
|
||||
for idx, organic in enumerate(response_dict[subquery].get("organic", [])):
|
||||
link = organic.get("link")
|
||||
if link in webpages:
|
||||
if link in webpages and idx < max_webpages_to_read:
|
||||
webpages[link]["queries"].add(subquery)
|
||||
else:
|
||||
# Content of web pages is directly available when Jina is used for search.
|
||||
elif idx < max_webpages_to_read:
|
||||
webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
|
||||
# Only keep webpage content for up to max_webpages_to_read organic results.
|
||||
if idx >= max_webpages_to_read and not is_none_or_empty(organic.get("content")):
|
||||
organic["content"] = None
|
||||
response_dict[subquery]["organic"][idx] = organic
|
||||
|
||||
# Read, extract relevant info from the retrieved web pages
|
||||
if webpages:
|
||||
|
|
Loading…
Reference in a new issue