Create chat actor for directly reading webpages based on user message

- Add prompt for the read webpages chat actor to extract or infer
  webpage links
- Make chat actor infer or extract webpages to read directly from the
  user message
- Rename previous read_webpage function to the narrower
  read_webpage_at_url function
Debanjum Singh Solanky 2024-03-13 15:22:57 +05:30
parent e549824fe2
commit 6118d1ff57
5 changed files with 112 additions and 6 deletions
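For orientation, here is a minimal sketch of how the new direct-reading flow might be driven end to end. This is an illustrative caller, not code from this commit: the query, the demo wrapper, and the LocationData field values are hypothetical (field names assumed from their use in infer_webpage_urls below), while read_webpages is the function added in this diff.

import asyncio

from khoj.processor.tools.online_search import read_webpages
from khoj.utils.rawconfig import LocationData

async def demo():
    # read_webpages infers URLs from the message, reads each page, and
    # returns {url: {"extracted_content": ...}} (see online_search.py below)
    results = await read_webpages(
        "Summarize this post, https://news.ycombinator.com/item?id=12345",
        conversation_history={},
        location=LocationData(city="Mumbai", region="Maharashtra", country="India"),
    )
    for url, info in results.items():
        print(url, "->", info["extracted_content"])

asyncio.run(demo())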

@@ -380,6 +380,50 @@ Khoj:
 """.strip()
 )
 
+infer_webpages_to_read = PromptTemplate.from_template(
+    """
+You are Khoj, an advanced web page reading assistant. You are to construct **up to three, valid** webpage urls to read before answering the user's question.
+- You will receive the conversation history as context.
+- Add as much context from the previous questions and answers as required to construct the webpage urls.
+- Use multiple web page urls if required to retrieve the relevant information.
+- You have access to the whole internet to retrieve information.
+
+Which webpages will you need to read to answer the user's question?
+Provide web page links as a list of strings in a JSON object.
+Current Date: {current_date}
+User's Location: {location}
+
+Here are some examples:
+History:
+User: I like to use Hacker News to get my tech news.
+AI: Hacker News is an online forum for sharing and discussing the latest tech news. It is a great place to learn about new technologies and startups.
+
+Q: Summarize this post about vector databases on Hacker News, https://news.ycombinator.com/item?id=12345
+Khoj: {{"links": ["https://news.ycombinator.com/item?id=12345"]}}
+
+History:
+User: I'm currently living in New York but I'm thinking about moving to San Francisco.
+AI: New York is a great city to live in. It has a lot of great restaurants and museums. San Francisco is also a great city to live in. It has good access to nature and a great tech scene.
+
+Q: What is the climate like in those cities?
+Khoj: {{"links": ["https://en.wikipedia.org/wiki/New_York_City", "https://en.wikipedia.org/wiki/San_Francisco"]}}
+
+History:
+User: Hey, how is it going?
+AI: Not too bad. How can I help you today?
+
+Q: What's the latest news on r/worldnews?
+Khoj: {{"links": ["https://www.reddit.com/r/worldnews/"]}}
+
+Now it's your turn to share actual webpage urls you'd like to read to answer the user's question.
+History:
+{chat_history}
+
+Q: {query}
+Khoj:
+""".strip()
+)
+
 online_search_conversation_subqueries = PromptTemplate.from_template(
     """
 You are Khoj, an advanced google search assistant. You are tasked with constructing **up to three** google search queries to answer the user's question.
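One detail worth calling out in the prompt above: the doubled braces in the example responses. PromptTemplate treats single braces as template variables, so {{ and }} are needed to emit literal JSON braces. A standalone sketch with an illustrative template string:

from langchain.prompts import PromptTemplate

# Single braces are template variables; doubled braces render as literal { and }
template = PromptTemplate.from_template('Khoj: {{"links": ["{url}"]}}')
print(template.format(url="https://example.com"))
# Khoj: {"links": ["https://example.com"]}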

@@ -2,6 +2,7 @@ import asyncio
 import json
 import logging
 import os
+from collections import defaultdict
 from typing import Dict, Tuple, Union
 
 import aiohttp
@@ -9,7 +10,11 @@ import requests
 from bs4 import BeautifulSoup
 from markdownify import markdownify
 
-from khoj.routers.helpers import extract_relevant_info, generate_online_subqueries
+from khoj.routers.helpers import (
+    extract_relevant_info,
+    generate_online_subqueries,
+    infer_webpage_urls,
+)
 from khoj.utils.helpers import is_none_or_empty, timer
 from khoj.utils.rawconfig import LocationData
@@ -38,7 +43,7 @@ MAX_WEBPAGES_TO_READ = 1
 
 async def search_online(query: str, conversation_history: dict, location: LocationData):
-    if SERPER_DEV_API_KEY is None:
+    if not online_search_enabled():
         logger.warn("SERPER_DEV_API_KEY is not set")
         return {}
@@ -93,10 +98,20 @@ def search_with_google(subquery: str):
     return extracted_search_result
 
 
+async def read_webpages(query: str, conversation_history: dict, location: LocationData):
+    "Infer web pages to read from the query and extract relevant information from them"
+    urls = await infer_webpage_urls(query, conversation_history, location)
+
+    results: Dict[str, Dict[str, str]] = defaultdict(dict)
+    for url in urls:
+        _, result = await read_webpage_and_extract_content(query, url)
+        results[url]["extracted_content"] = result
+    return results
+
+
 async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]:
     try:
         with timer(f"Reading web page at '{url}' took", logger):
-            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage(url)
+            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_at_url(url)
         with timer(f"Extracting relevant information from web page at '{url}' took", logger):
             extracted_info = await extract_relevant_info(subquery, content)
         return subquery, extracted_info
@@ -105,7 +120,7 @@ async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str
         return subquery, None
 
 
-async def read_webpage(web_url: str) -> str:
+async def read_webpage_at_url(web_url: str) -> str:
     headers = {
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
     }
@@ -129,3 +144,7 @@ async def read_webpage_with_olostep(web_url: str) -> str:
             response.raise_for_status()
             response_json = await response.json()
             return response_json["markdown_content"]
+
+
+def online_search_enabled():
+    return SERPER_DEV_API_KEY is not None
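The new read_webpages helper awaits each page read in sequence. Because read_webpage_and_extract_content already returns a (subquery, extracted_info) tuple per URL, the reads could also be fanned out concurrently. A hypothetical variant, not part of this commit, assuming the same helpers as in the diff above:

import asyncio

async def read_webpages_concurrently(query: str, conversation_history: dict, location: LocationData):
    # Same contract as read_webpages, but fetch all inferred pages at once
    urls = await infer_webpage_urls(query, conversation_history, location)
    pairs = await asyncio.gather(*(read_webpage_and_extract_content(query, url) for url in urls))
    return {url: {"extracted_content": result} for url, (_, result) in zip(urls, pairs)}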

@@ -36,6 +36,7 @@ from khoj.utils.config import GPT4AllProcessorModel
 from khoj.utils.helpers import (
     ConversationCommand,
     is_none_or_empty,
+    is_valid_url,
     log_telemetry,
     mode_descriptions_for_llm,
     timer,
@@ -229,6 +230,35 @@ async def aget_relevant_output_modes(query: str, conversation_history: dict):
     return ConversationCommand.Default
 
 
+async def infer_webpage_urls(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
+    """
+    Infer webpage links from the given query
+    """
+    location = f"{location_data.city}, {location_data.region}, {location_data.country}" if location_data else "Unknown"
+    chat_history = construct_chat_history(conversation_history)
+
+    utc_date = datetime.utcnow().strftime("%Y-%m-%d")
+    online_queries_prompt = prompts.infer_webpages_to_read.format(
+        current_date=utc_date,
+        query=q,
+        chat_history=chat_history,
+        location=location,
+    )
+
+    response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object")
+
+    # Validate that the response is a non-empty, JSON-serializable list of URLs
+    try:
+        response = response.strip()
+        urls = json.loads(response)
+        valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)}
+        if is_none_or_empty(valid_unique_urls):
+            raise ValueError(f"Invalid list of urls: {response}")
+        return list(valid_unique_urls)
+    except Exception:
+        raise ValueError(f"Invalid list of urls: {response}")
+
+
 async def generate_online_subqueries(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
     """
     Generate subqueries from the given query
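The try block above both validates and deduplicates: entries failing is_valid_url are filtered out and the set comprehension collapses repeats. Tracing it by hand on an illustrative model response (the response string is hypothetical):

import json

from khoj.utils.helpers import is_valid_url

# Hypothetical model output with a duplicate and a malformed entry
response = '{"links": ["https://example.com", "https://example.com", "not-a-url"]}'
urls = json.loads(response)
valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)}
print(valid_unique_urls)  # {'https://example.com'}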

@@ -15,6 +15,7 @@ from os import path
 from pathlib import Path
 from time import perf_counter
 from typing import TYPE_CHECKING, Optional, Union
+from urllib.parse import urlparse
 
 import torch
 from asgiref.sync import sync_to_async
@@ -340,3 +341,12 @@ def in_debug_mode():
     """Check if Khoj is running in debug mode.
     Set KHOJ_DEBUG environment variable to true to enable debug mode."""
     return is_env_var_true("KHOJ_DEBUG")
+
+
+def is_valid_url(url: str) -> bool:
+    """Check if a string is a valid URL"""
+    try:
+        result = urlparse(url.strip())
+        return all([result.scheme, result.netloc])
+    except:
+        return False
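urlparse rarely raises, so the effective check is that both a scheme and a network location were parsed. A few illustrative calls:

from khoj.utils.helpers import is_valid_url

print(is_valid_url("https://en.wikipedia.org/wiki/Great_Chicago_Fire"))  # True
print(is_valid_url("example.com"))  # False: no scheme, parsed entirely as a path
print(is_valid_url("https://"))     # False: scheme present but netloc empty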

@@ -7,7 +7,10 @@ import pytest
 from scipy.stats import linregress
 
 from khoj.processor.embeddings import EmbeddingsModel
-from khoj.processor.tools.online_search import read_webpage, read_webpage_with_olostep
+from khoj.processor.tools.online_search import (
+    read_webpage_at_url,
+    read_webpage_with_olostep,
+)
 from khoj.utils import helpers
@@ -90,7 +93,7 @@ async def test_reading_webpage():
     website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"
 
     # Act
-    response = await read_webpage(website)
+    response = await read_webpage_at_url(website)
 
     # Assert
     assert (