Only keep webpage content requested, even if Jina API gets more data

Jina search API returns content of all webpages in search results. Previously code wouldn't remove content beyond max_webpages_to_read limit set. Now, webpage content in organic results aree explicitly removed beyond the requested max_webpage_to_read limit. This should align behavior of online results from Jina with other online search providers. And restrict llm context to a reasonable size when using Jina for online search.
2024-11-23 15:38:55 +01:00 · 2024-11-10 13:19:24 -08:00 · 2024-11-10 13:19:24 -08:00 · eb492f3025
commit eb492f3025
parent 8ef7892c5e
1 changed files with 8 additions and 4 deletions
--- a/src/khoj/processor/tools/online_search.py
+++ b/src/khoj/processor/tools/online_search.py
@ -95,17 +95,21 @@ async def search_online(
        response_dict = {subquery: search_result for subquery, search_result in search_results}

    # Gather distinct web pages from organic results for subqueries without an instant answer.
-    # Content of web pages is directly available when Jina is used for search.
    webpages: Dict[str, Dict] = {}
    for subquery in response_dict:
        if "answerBox" in response_dict[subquery]:
            continue
-        for organic in response_dict[subquery].get("organic", [])[:max_webpages_to_read]:
+        for idx, organic in enumerate(response_dict[subquery].get("organic", [])):
            link = organic.get("link")
-            if link in webpages:
+            if link in webpages and idx < max_webpages_to_read:
                webpages[link]["queries"].add(subquery)
-            else:
+            # Content of web pages is directly available when Jina is used for search.
+            elif idx < max_webpages_to_read:
                webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
+            # Only keep webpage content for up to max_webpages_to_read organic results.
+            if idx >= max_webpages_to_read and not is_none_or_empty(organic.get("content")):
+                organic["content"] = None
+                response_dict[subquery]["organic"][idx] = organic

    # Read, extract relevant info from the retrieved web pages
    if webpages: