diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py
index a4256525..5fa75bca 100644
--- a/src/khoj/processor/conversation/prompts.py
+++ b/src/khoj/processor/conversation/prompts.py
@@ -380,6 +380,50 @@ Khoj:
 """.strip()
 )
 
+infer_webpages_to_read = PromptTemplate.from_template(
+    """
+You are Khoj, an advanced web page reading assistant. You are to construct **up to three, valid** webpage urls to read before answering the user's question.
+- You will receive the conversation history as context.
+- Add as much context from the previous questions and answers as required to construct the webpage urls.
+- Use multiple web page urls if required to retrieve the relevant information.
+- You have access to the whole internet to retrieve information.
+
+Which webpages will you need to read to answer the user's question?
+Provide web page links as a list of strings in a JSON object.
+Current Date: {current_date}
+User's Location: {location}
+
+Here are some examples:
+History:
+User: I like to use Hacker News to get my tech news.
+AI: Hacker News is an online forum for sharing and discussing the latest tech news. It is a great place to learn about new technologies and startups.
+
+Q: Summarize this post about vector databases on Hacker News, https://news.ycombinator.com/item?id=12345
+Khoj: {{"links": ["https://news.ycombinator.com/item?id=12345"]}}
+
+History:
+User: I'm currently living in New York but I'm thinking about moving to San Francisco.
+AI: New York is a great city to live in. It has a lot of great restaurants and museums. San Francisco is also a great city to live in. It has good access to nature and a great tech scene.
+
+Q: What is the climate like in those cities?
+Khoj: {{"links": ["https://en.wikipedia.org/wiki/New_York_City", "https://en.wikipedia.org/wiki/San_Francisco"]}}
+
+History:
+User: Hey, how is it going?
+AI: Not too bad. How can I help you today?
+
+Q: What's the latest news on r/worldnews?
+Khoj: {{"links": ["https://www.reddit.com/r/worldnews/"]}}
+
+Now it's your turn to share actual webpage urls you'd like to read to answer the user's question.
+History:
+{chat_history}
+
+Q: {query}
+Khoj:
+""".strip()
+)
+
 online_search_conversation_subqueries = PromptTemplate.from_template(
     """
 You are Khoj, an advanced google search assistant. You are tasked with constructing **up to three** google search queries to answer the user's question.
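Aside from the patch itself: a minimal sketch of how this new template is meant to be exercised, mirroring the `format(...)` call that `infer_webpage_urls` makes in `src/khoj/routers/helpers.py` below. The date, location, query, and model reply here are hypothetical placeholders, not part of the change.

```python
import json

from khoj.processor.conversation import prompts

# Render the template the same way infer_webpage_urls does (values are made up)
prompt = prompts.infer_webpages_to_read.format(
    current_date="2024-03-01",
    location="San Francisco, California, United States",
    chat_history="",  # fresh chat, no prior turns
    query="What's trending on Hacker News today?",
)

# A well-behaved model reply is a JSON object with a "links" list, e.g.:
raw_response = '{"links": ["https://news.ycombinator.com/"]}'
urls = json.loads(raw_response)["links"]
assert urls == ["https://news.ycombinator.com/"]
```

Note the doubled `{{...}}` braces in the few-shot examples: `PromptTemplate` uses f-string templating, so they render as literal JSON braces and only the four named variables are substituted.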
diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py
index 597f394e..84ca7bac 100644
--- a/src/khoj/processor/tools/online_search.py
+++ b/src/khoj/processor/tools/online_search.py
@@ -2,6 +2,7 @@ import asyncio
 import json
 import logging
 import os
+from collections import defaultdict
 from typing import Dict, Tuple, Union
 
 import aiohttp
@@ -9,7 +10,11 @@ import requests
 from bs4 import BeautifulSoup
 from markdownify import markdownify
 
-from khoj.routers.helpers import extract_relevant_info, generate_online_subqueries
+from khoj.routers.helpers import (
+    extract_relevant_info,
+    generate_online_subqueries,
+    infer_webpage_urls,
+)
 from khoj.utils.helpers import is_none_or_empty, timer
 from khoj.utils.rawconfig import LocationData
 
@@ -38,7 +43,7 @@ MAX_WEBPAGES_TO_READ = 1
 
 
 async def search_online(query: str, conversation_history: dict, location: LocationData):
-    if SERPER_DEV_API_KEY is None:
+    if not online_search_enabled():
         logger.warn("SERPER_DEV_API_KEY is not set")
         return {}
 
@@ -93,10 +98,20 @@ def search_with_google(subquery: str):
     return extracted_search_result
 
 
+async def read_webpages(query: str, conversation_history: dict, location: LocationData):
+    "Infer web pages to read from the query and extract relevant information from them"
+    urls = await infer_webpage_urls(query, conversation_history, location)
+    results: Dict[str, Dict[str, str]] = defaultdict(dict)
+    for url in urls:
+        _, result = await read_webpage_and_extract_content(query, url)
+        results[url]["extracted_content"] = result
+    return results
+
+
 async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]:
     try:
         with timer(f"Reading web page at '{url}' took", logger):
-            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage(url)
+            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_at_url(url)
         with timer(f"Extracting relevant information from web page at '{url}' took", logger):
             extracted_info = await extract_relevant_info(subquery, content)
         return subquery, extracted_info
@@ -105,7 +120,7 @@ async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str
         return subquery, None
 
 
-async def read_webpage(web_url: str) -> str:
+async def read_webpage_at_url(web_url: str) -> str:
     headers = {
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
     }
@@ -129,3 +144,7 @@ async def read_webpage_with_olostep(web_url: str) -> str:
         response.raise_for_status()
         response_json = await response.json()
         return response_json["markdown_content"]
+
+
+def online_search_enabled():
+    return SERPER_DEV_API_KEY is not None
diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py
index 724d640a..a9dd5fb3 100644
--- a/src/khoj/routers/helpers.py
+++ b/src/khoj/routers/helpers.py
@@ -36,6 +36,7 @@ from khoj.utils.config import GPT4AllProcessorModel
 from khoj.utils.helpers import (
     ConversationCommand,
     is_none_or_empty,
+    is_valid_url,
     log_telemetry,
     mode_descriptions_for_llm,
     timer,
@@ -229,6 +230,35 @@ async def aget_relevant_output_modes(query: str, conversation_history: dict):
     return ConversationCommand.Default
 
 
+async def infer_webpage_urls(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
+    """
+    Infer webpage links from the given query
+    """
+    location = f"{location_data.city}, {location_data.region}, {location_data.country}" if location_data else "Unknown"
+    chat_history = construct_chat_history(conversation_history)
+
+    utc_date = datetime.utcnow().strftime("%Y-%m-%d")
+    online_queries_prompt = prompts.infer_webpages_to_read.format(
+        current_date=utc_date,
+        query=q,
+        chat_history=chat_history,
+        location=location,
+    )
+
+    response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object")
+
+    # Validate that the response is a non-empty, JSON-serializable list of URLs
+    try:
+        response = response.strip()
+        urls = json.loads(response)
+        valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)}
+        if is_none_or_empty(valid_unique_urls):
+            raise ValueError(f"Invalid list of urls: {response}")
+        return list(valid_unique_urls)
+    except Exception:
+        raise ValueError(f"Invalid list of urls: {response}")
+
+
 async def generate_online_subqueries(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
     """
     Generate subqueries from the given query
diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py
index 150398ee..4023722e 100644
--- a/src/khoj/utils/helpers.py
+++ b/src/khoj/utils/helpers.py
@@ -15,6 +15,7 @@ from os import path
 from pathlib import Path
 from time import perf_counter
 from typing import TYPE_CHECKING, Optional, Union
+from urllib.parse import urlparse
 
 import torch
 from asgiref.sync import sync_to_async
@@ -340,3 +341,12 @@ def in_debug_mode():
     """Check if Khoj is running in debug mode.
     Set KHOJ_DEBUG environment variable to true to enable debug mode."""
     return is_env_var_true("KHOJ_DEBUG")
+
+
+def is_valid_url(url: str) -> bool:
+    """Check if a string is a valid URL"""
+    try:
+        result = urlparse(url.strip())
+        return all([result.scheme, result.netloc])
+    except:
+        return False
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index 086e4895..131c3553 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -7,7 +7,10 @@ import pytest
 from scipy.stats import linregress
 
 from khoj.processor.embeddings import EmbeddingsModel
-from khoj.processor.tools.online_search import read_webpage, read_webpage_with_olostep
+from khoj.processor.tools.online_search import (
+    read_webpage_at_url,
+    read_webpage_with_olostep,
+)
 from khoj.utils import helpers
 
 
@@ -90,7 +93,7 @@ async def test_reading_webpage():
     website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"
 
     # Act
-    response = await read_webpage(website)
+    response = await read_webpage_at_url(website)
 
     # Assert
     assert (