Create chat actor for directly reading webpages based on user message

- Add prompt for the read-webpages chat actor to extract or infer
  webpage links from the user message
- Make the chat actor infer or extract the webpages to read directly
  from the user message
- Rename the previous read_webpage function to the more narrowly scoped
  read_webpage_at_url function
Debanjum Singh Solanky 2024-03-13 15:22:57 +05:30
parent e549824fe2
commit 6118d1ff57
5 changed files with 112 additions and 6 deletions
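
At a glance, this change lets callers go from a raw chat message to extracted webpage content in one call. A minimal sketch of the new flow (the LocationData constructor arguments and the asyncio scaffolding are illustrative assumptions, not part of this diff):

```python
import asyncio

from khoj.processor.tools.online_search import read_webpages
from khoj.utils.rawconfig import LocationData

async def main():
    # Assumed constructor; the diff only shows city/region/country being read.
    location = LocationData(city="Bengaluru", region="Karnataka", country="India")
    # URLs are inferred or extracted from the message itself, then read.
    results = await read_webpages(
        "Summarize this post, https://news.ycombinator.com/item?id=12345",
        conversation_history={},
        location=location,
    )
    for url, result in results.items():
        print(url, result["extracted_content"][:100])

asyncio.run(main())
```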

View file

@@ -380,6 +380,50 @@ Khoj:
""".strip()
)
infer_webpages_to_read = PromptTemplate.from_template(
    """
You are Khoj, an advanced web page reading assistant. You are to construct **up to three, valid** webpage urls to read before answering the user's question.
- You will receive the conversation history as context.
- Add as much context from the previous questions and answers as required to construct the webpage urls.
- Use multiple web page urls if required to retrieve the relevant information.
- You have access to the whole internet to retrieve information.

Which webpages will you need to read to answer the user's question?
Provide web page links as a list of strings in a JSON object.
Current Date: {current_date}
User's Location: {location}

Here are some examples:
History:
User: I like to use Hacker News to get my tech news.
AI: Hacker News is an online forum for sharing and discussing the latest tech news. It is a great place to learn about new technologies and startups.

Q: Summarize this post about vector databases on Hacker News, https://news.ycombinator.com/item?id=12345
Khoj: {{"links": ["https://news.ycombinator.com/item?id=12345"]}}

History:
User: I'm currently living in New York but I'm thinking about moving to San Francisco.
AI: New York is a great city to live in. It has a lot of great restaurants and museums. San Francisco is also a great city to live in. It has good access to nature and a great tech scene.

Q: What is the climate like in those cities?
Khoj: {{"links": ["https://en.wikipedia.org/wiki/New_York_City", "https://en.wikipedia.org/wiki/San_Francisco"]}}

History:
User: Hey, how is it going?
AI: Not too bad. How can I help you today?

Q: What's the latest news on r/worldnews?
Khoj: {{"links": ["https://www.reddit.com/r/worldnews/"]}}

Now it's your turn to share actual webpage urls you'd like to read to answer the user's question.
History:
{chat_history}

Q: {query}
Khoj:
""".strip()
)
online_search_conversation_subqueries = PromptTemplate.from_template(
"""
You are Khoj, an advanced google search assistant. You are tasked with constructing **up to three** google search queries to answer the user's question.
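
For reference, here is roughly how the new infer_webpages_to_read template above gets rendered by the chat actor introduced below (a sketch; the prompts module path and the sample values are assumptions):

```python
from datetime import datetime

from khoj.processor.conversation import prompts  # assumed module path

rendered_prompt = prompts.infer_webpages_to_read.format(
    current_date=datetime.utcnow().strftime("%Y-%m-%d"),
    query="What's the latest news on r/worldnews?",
    chat_history="",
    location="New York, New York, United States",
)
# The model is expected to reply with a JSON object such as:
# {"links": ["https://www.reddit.com/r/worldnews/"]}
```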

View file

@@ -2,6 +2,7 @@ import asyncio
import json
import logging
import os
from collections import defaultdict
from typing import Dict, Tuple, Union

import aiohttp
@@ -9,7 +10,11 @@ import requests
from bs4 import BeautifulSoup
from markdownify import markdownify

-from khoj.routers.helpers import extract_relevant_info, generate_online_subqueries
+from khoj.routers.helpers import (
+    extract_relevant_info,
+    generate_online_subqueries,
+    infer_webpage_urls,
+)
from khoj.utils.helpers import is_none_or_empty, timer
from khoj.utils.rawconfig import LocationData
@@ -38,7 +43,7 @@ MAX_WEBPAGES_TO_READ = 1

async def search_online(query: str, conversation_history: dict, location: LocationData):
-    if SERPER_DEV_API_KEY is None:
+    if not online_search_enabled():
        logger.warn("SERPER_DEV_API_KEY is not set")
        return {}
@@ -93,10 +98,20 @@ def search_with_google(subquery: str):
    return extracted_search_result


async def read_webpages(query: str, conversation_history: dict, location: LocationData):
    "Infer web pages to read from the query and extract relevant information from them"
    urls = await infer_webpage_urls(query, conversation_history, location)

    results: Dict[str, Dict[str, str]] = defaultdict(dict)
    for url in urls:
        _, result = await read_webpage_and_extract_content(query, url)
        results[url]["extracted_content"] = result
    return results


async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]:
    try:
        with timer(f"Reading web page at '{url}' took", logger):
-            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage(url)
+            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_at_url(url)
        with timer(f"Extracting relevant information from web page at '{url}' took", logger):
            extracted_info = await extract_relevant_info(subquery, content)
        return subquery, extracted_info
@@ -105,7 +120,7 @@ async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str
        return subquery, None


-async def read_webpage(web_url: str) -> str:
+async def read_webpage_at_url(web_url: str) -> str:
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    }
@@ -129,3 +144,7 @@ async def read_webpage_with_olostep(web_url: str) -> str:
        response.raise_for_status()
        response_json = await response.json()
        return response_json["markdown_content"]


def online_search_enabled():
    return SERPER_DEV_API_KEY is not None
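
Taken together, the renamed reader and the new guard can be exercised directly (a sketch; the URL and prints are placeholders, and note that only search_online is gated on the Serper key, direct page reads are not):

```python
import asyncio

from khoj.processor.tools.online_search import (
    online_search_enabled,
    read_webpage_at_url,
)

async def demo():
    if not online_search_enabled():
        # search_online() would return {} here, but direct reads still work
        print("Serper search disabled; reading page directly")
    content = await read_webpage_at_url("https://example.com")
    print(content[:200])

asyncio.run(demo())
```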

View file

@@ -36,6 +36,7 @@ from khoj.utils.config import GPT4AllProcessorModel
from khoj.utils.helpers import (
    ConversationCommand,
    is_none_or_empty,
    is_valid_url,
    log_telemetry,
    mode_descriptions_for_llm,
    timer,
@@ -229,6 +230,35 @@ async def aget_relevant_output_modes(query: str, conversation_history: dict):
    return ConversationCommand.Default


async def infer_webpage_urls(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
    """
    Infer webpage links from the given query
    """
    location = f"{location_data.city}, {location_data.region}, {location_data.country}" if location_data else "Unknown"
    chat_history = construct_chat_history(conversation_history)

    utc_date = datetime.utcnow().strftime("%Y-%m-%d")
    online_queries_prompt = prompts.infer_webpages_to_read.format(
        current_date=utc_date,
        query=q,
        chat_history=chat_history,
        location=location,
    )

    response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object")

    # Validate that the response is a non-empty, JSON-serializable list of URLs
    try:
        response = response.strip()
        urls = json.loads(response)
        valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)}
        if is_none_or_empty(valid_unique_urls):
            raise ValueError(f"Invalid list of urls: {response}")
        return list(valid_unique_urls)
    except Exception:
        raise ValueError(f"Invalid list of urls: {response}")


async def generate_online_subqueries(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
    """
    Generate subqueries from the given query
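
The validation step in infer_webpage_urls is easy to reason about in isolation. A standalone sketch of what it does with a model reply (the sample response string is made up):

```python
import json

from khoj.utils.helpers import is_none_or_empty, is_valid_url

response = '{"links": ["https://en.wikipedia.org/wiki/San_Francisco", "not a url"]}'
urls = json.loads(response.strip())
valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)}
if is_none_or_empty(valid_unique_urls):
    raise ValueError(f"Invalid list of urls: {response}")
print(list(valid_unique_urls))
# ['https://en.wikipedia.org/wiki/San_Francisco']
```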

View file

@@ -15,6 +15,7 @@ from os import path
from pathlib import Path
from time import perf_counter
from typing import TYPE_CHECKING, Optional, Union
from urllib.parse import urlparse

import torch
from asgiref.sync import sync_to_async
@@ -340,3 +341,12 @@ def in_debug_mode():
    """Check if Khoj is running in debug mode.
    Set KHOJ_DEBUG environment variable to true to enable debug mode."""
    return is_env_var_true("KHOJ_DEBUG")


def is_valid_url(url: str) -> bool:
    """Check if a string is a valid URL"""
    try:
        result = urlparse(url.strip())
        return all([result.scheme, result.netloc])
    except:
        return False
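
Expected behavior of the new helper on a few sample inputs (the samples are illustrative):

```python
from khoj.utils.helpers import is_valid_url

assert is_valid_url("https://en.wikipedia.org/wiki/New_York_City")
assert is_valid_url(" https://example.com ")  # surrounding whitespace is stripped
assert not is_valid_url("www.example.com")    # no scheme
assert not is_valid_url("just some text")     # no scheme or netloc
```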

View file

@@ -7,7 +7,10 @@ import pytest
from scipy.stats import linregress

from khoj.processor.embeddings import EmbeddingsModel
-from khoj.processor.tools.online_search import read_webpage, read_webpage_with_olostep
+from khoj.processor.tools.online_search import (
+    read_webpage_at_url,
+    read_webpage_with_olostep,
+)
from khoj.utils import helpers
@@ -90,7 +93,7 @@ async def test_reading_webpage():
    website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"

    # Act
-    response = await read_webpage(website)
+    response = await read_webpage_at_url(website)

    # Assert
    assert (