Create chat actor for directly reading webpages based on user message

- Add prompt for the read-webpages chat actor to extract or infer
  webpage links from the user message
- Make the chat actor infer or extract the webpages to read directly
  from the user message
- Rename the previous read_webpage function to the more narrowly scoped
  read_webpage_at_url function
Debanjum Singh Solanky 2024-03-13 15:22:57 +05:30
parent e549824fe2
commit 6118d1ff57
5 changed files with 112 additions and 6 deletions
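
At a glance, this change lets callers go from a raw chat message to extracted webpage content in one call. A minimal sketch of the new flow (the LocationData constructor arguments and the asyncio scaffolding are illustrative assumptions, not part of this diff):

```python
import asyncio

from khoj.processor.tools.online_search import read_webpages
from khoj.utils.rawconfig import LocationData

async def main():
    # Assumed constructor; the diff only shows city/region/country being read.
    location = LocationData(city="Bengaluru", region="Karnataka", country="India")
    # URLs are inferred or extracted from the message itself, then read.
    results = await read_webpages(
        "Summarize this post, https://news.ycombinator.com/item?id=12345",
        conversation_history={},
        location=location,
    )
    for url, result in results.items():
        print(url, result["extracted_content"][:100])

asyncio.run(main())
```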

View file

@@ -380,6 +380,50 @@ Khoj:
""".strip()
)
infer_webpages_to_read = PromptTemplate.from_template(
    """
You are Khoj, an advanced web page reading assistant. You are to construct **up to three, valid** webpage urls to read before answering the user's question.
- You will receive the conversation history as context.
- Add as much context from the previous questions and answers as required to construct the webpage urls.
- Use multiple web page urls if required to retrieve the relevant information.
- You have access to the whole internet to retrieve information.

Which webpages will you need to read to answer the user's question?
Provide web page links as a list of strings in a JSON object.
Current Date: {current_date}
User's Location: {location}

Here are some examples:
History:
User: I like to use Hacker News to get my tech news.
AI: Hacker News is an online forum for sharing and discussing the latest tech news. It is a great place to learn about new technologies and startups.

Q: Summarize this post about vector databases on Hacker News, https://news.ycombinator.com/item?id=12345
Khoj: {{"links": ["https://news.ycombinator.com/item?id=12345"]}}

History:
User: I'm currently living in New York but I'm thinking about moving to San Francisco.
AI: New York is a great city to live in. It has a lot of great restaurants and museums. San Francisco is also a great city to live in. It has good access to nature and a great tech scene.

Q: What is the climate like in those cities?
Khoj: {{"links": ["https://en.wikipedia.org/wiki/New_York_City", "https://en.wikipedia.org/wiki/San_Francisco"]}}

History:
User: Hey, how is it going?
AI: Not too bad. How can I help you today?

Q: What's the latest news on r/worldnews?
Khoj: {{"links": ["https://www.reddit.com/r/worldnews/"]}}

Now it's your turn to share actual webpage urls you'd like to read to answer the user's question.
History:
{chat_history}

Q: {query}
Khoj:
""".strip()
)
online_search_conversation_subqueries = PromptTemplate.from_template(
"""
You are Khoj, an advanced google search assistant. You are tasked with constructing **up to three** google search queries to answer the user's question.
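
For reference, here is roughly how the new infer_webpages_to_read template above gets rendered by the chat actor introduced below (a sketch; the prompts module path and the sample values are assumptions):

```python
from datetime import datetime

from khoj.processor.conversation import prompts  # assumed module path

rendered_prompt = prompts.infer_webpages_to_read.format(
    current_date=datetime.utcnow().strftime("%Y-%m-%d"),
    query="What's the latest news on r/worldnews?",
    chat_history="",
    location="New York, New York, United States",
)
# The model is expected to reply with a JSON object such as:
# {"links": ["https://www.reddit.com/r/worldnews/"]}
```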

View file

@@ -2,6 +2,7 @@ import asyncio
import json
import logging
import os
from collections import defaultdict
from typing import Dict, Tuple, Union

import aiohttp
@@ -9,7 +10,11 @@ import requests
from bs4 import BeautifulSoup
from markdownify import markdownify

-from khoj.routers.helpers import extract_relevant_info, generate_online_subqueries
+from khoj.routers.helpers import (
+    extract_relevant_info,
+    generate_online_subqueries,
+    infer_webpage_urls,
+)
from khoj.utils.helpers import is_none_or_empty, timer
from khoj.utils.rawconfig import LocationData
@@ -38,7 +43,7 @@ MAX_WEBPAGES_TO_READ = 1

async def search_online(query: str, conversation_history: dict, location: LocationData):
-    if SERPER_DEV_API_KEY is None:
+    if not online_search_enabled():
        logger.warn("SERPER_DEV_API_KEY is not set")
        return {}
@@ -93,10 +98,20 @@ def search_with_google(subquery: str):
    return extracted_search_result


async def read_webpages(query: str, conversation_history: dict, location: LocationData):
    "Infer web pages to read from the query and extract relevant information from them"
    urls = await infer_webpage_urls(query, conversation_history, location)

    results: Dict[str, Dict[str, str]] = defaultdict(dict)
    for url in urls:
        _, result = await read_webpage_and_extract_content(query, url)
        results[url]["extracted_content"] = result
    return results


async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]:
    try:
        with timer(f"Reading web page at '{url}' took", logger):
-            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage(url)
+            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_at_url(url)
        with timer(f"Extracting relevant information from web page at '{url}' took", logger):
            extracted_info = await extract_relevant_info(subquery, content)
        return subquery, extracted_info
@@ -105,7 +120,7 @@ async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str
        return subquery, None


-async def read_webpage(web_url: str) -> str:
+async def read_webpage_at_url(web_url: str) -> str:
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    }
@@ -129,3 +144,7 @@ async def read_webpage_with_olostep(web_url: str) -> str:
        response.raise_for_status()
        response_json = await response.json()
        return response_json["markdown_content"]


def online_search_enabled():
    return SERPER_DEV_API_KEY is not None
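
Taken together, the renamed reader and the new guard can be exercised directly (a sketch; the URL and prints are placeholders, and note that only search_online is gated on the Serper key, direct page reads are not):

```python
import asyncio

from khoj.processor.tools.online_search import (
    online_search_enabled,
    read_webpage_at_url,
)

async def demo():
    if not online_search_enabled():
        # search_online() would return {} here, but direct reads still work
        print("Serper search disabled; reading page directly")
    content = await read_webpage_at_url("https://example.com")
    print(content[:200])

asyncio.run(demo())
```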

View file

@@ -36,6 +36,7 @@ from khoj.utils.config import GPT4AllProcessorModel
from khoj.utils.helpers import (
    ConversationCommand,
    is_none_or_empty,
    is_valid_url,
    log_telemetry,
    mode_descriptions_for_llm,
    timer,
@@ -229,6 +230,35 @@ async def aget_relevant_output_modes(query: str, conversation_history: dict):
    return ConversationCommand.Default


async def infer_webpage_urls(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
    """
    Infer webpage links from the given query
    """
    location = f"{location_data.city}, {location_data.region}, {location_data.country}" if location_data else "Unknown"
    chat_history = construct_chat_history(conversation_history)

    utc_date = datetime.utcnow().strftime("%Y-%m-%d")
    online_queries_prompt = prompts.infer_webpages_to_read.format(
        current_date=utc_date,
        query=q,
        chat_history=chat_history,
        location=location,
    )

    response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object")

    # Validate that the response is a non-empty, JSON-serializable list of URLs
    try:
        response = response.strip()
        urls = json.loads(response)
        valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)}
        if is_none_or_empty(valid_unique_urls):
            raise ValueError(f"Invalid list of urls: {response}")
        return list(valid_unique_urls)
    except Exception:
        raise ValueError(f"Invalid list of urls: {response}")


async def generate_online_subqueries(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
    """
    Generate subqueries from the given query
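
The validation step in infer_webpage_urls is easy to reason about in isolation. A standalone sketch of what it does with a model reply (the sample response string is made up):

```python
import json

from khoj.utils.helpers import is_none_or_empty, is_valid_url

response = '{"links": ["https://en.wikipedia.org/wiki/San_Francisco", "not a url"]}'
urls = json.loads(response.strip())
valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)}
if is_none_or_empty(valid_unique_urls):
    raise ValueError(f"Invalid list of urls: {response}")
print(list(valid_unique_urls))
# ['https://en.wikipedia.org/wiki/San_Francisco']
```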

View file

@@ -15,6 +15,7 @@ from os import path
from pathlib import Path
from time import perf_counter
from typing import TYPE_CHECKING, Optional, Union
from urllib.parse import urlparse

import torch
from asgiref.sync import sync_to_async
@@ -340,3 +341,12 @@ def in_debug_mode():
    """Check if Khoj is running in debug mode.
    Set KHOJ_DEBUG environment variable to true to enable debug mode."""
    return is_env_var_true("KHOJ_DEBUG")


def is_valid_url(url: str) -> bool:
    """Check if a string is a valid URL"""
    try:
        result = urlparse(url.strip())
        return all([result.scheme, result.netloc])
    except:
        return False
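
Expected behavior of the new helper on a few sample inputs (the samples are illustrative):

```python
from khoj.utils.helpers import is_valid_url

assert is_valid_url("https://en.wikipedia.org/wiki/New_York_City")
assert is_valid_url(" https://example.com ")  # surrounding whitespace is stripped
assert not is_valid_url("www.example.com")    # no scheme
assert not is_valid_url("just some text")     # no scheme or netloc
```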

View file

@@ -7,7 +7,10 @@ import pytest
from scipy.stats import linregress

from khoj.processor.embeddings import EmbeddingsModel
-from khoj.processor.tools.online_search import read_webpage, read_webpage_with_olostep
+from khoj.processor.tools.online_search import (
+    read_webpage_at_url,
+    read_webpage_with_olostep,
+)
from khoj.utils import helpers
@@ -90,7 +93,7 @@ async def test_reading_webpage():
    website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"

    # Act
-    response = await read_webpage(website)
+    response = await read_webpage_at_url(website)

    # Assert
    assert (