Create chat actor for directly reading webpages based on user message

- Add prompt for the read webpages chat actor to extract or infer
  webpage links
- Make chat actor infer or extract webpages to read directly from the
  user message
- Rename previous read_webpage function to the narrower
  read_webpage_at_url function
Debanjum Singh Solanky 2024-03-13 15:22:57 +05:30
parent e549824fe2
commit 6118d1ff57
5 changed files with 112 additions and 6 deletions
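For orientation, here is a minimal sketch of how the new direct-reading flow might be driven end to end. This is an illustrative caller, not code from this commit: the query, the demo wrapper, and the LocationData field values are hypothetical (field names assumed from their use in infer_webpage_urls below), while read_webpages is the function added in this diff.

import asyncio

from khoj.processor.tools.online_search import read_webpages
from khoj.utils.rawconfig import LocationData

async def demo():
    # read_webpages infers URLs from the message, reads each page, and
    # returns {url: {"extracted_content": ...}} (see online_search.py below)
    results = await read_webpages(
        "Summarize this post, https://news.ycombinator.com/item?id=12345",
        conversation_history={},
        location=LocationData(city="Mumbai", region="Maharashtra", country="India"),
    )
    for url, info in results.items():
        print(url, "->", info["extracted_content"])

asyncio.run(demo())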

@@ -380,6 +380,50 @@ Khoj:
 """.strip()
 )
 
+infer_webpages_to_read = PromptTemplate.from_template(
+    """
+You are Khoj, an advanced web page reading assistant. You are to construct **up to three, valid** webpage urls to read before answering the user's question.
+- You will receive the conversation history as context.
+- Add as much context from the previous questions and answers as required to construct the webpage urls.
+- Use multiple web page urls if required to retrieve the relevant information.
+- You have access to the whole internet to retrieve information.
+
+Which webpages will you need to read to answer the user's question?
+Provide web page links as a list of strings in a JSON object.
+Current Date: {current_date}
+User's Location: {location}
+
+Here are some examples:
+History:
+User: I like to use Hacker News to get my tech news.
+AI: Hacker News is an online forum for sharing and discussing the latest tech news. It is a great place to learn about new technologies and startups.
+
+Q: Summarize this post about vector databases on Hacker News, https://news.ycombinator.com/item?id=12345
+Khoj: {{"links": ["https://news.ycombinator.com/item?id=12345"]}}
+
+History:
+User: I'm currently living in New York but I'm thinking about moving to San Francisco.
+AI: New York is a great city to live in. It has a lot of great restaurants and museums. San Francisco is also a great city to live in. It has good access to nature and a great tech scene.
+
+Q: What is the climate like in those cities?
+Khoj: {{"links": ["https://en.wikipedia.org/wiki/New_York_City", "https://en.wikipedia.org/wiki/San_Francisco"]}}
+
+History:
+User: Hey, how is it going?
+AI: Not too bad. How can I help you today?
+
+Q: What's the latest news on r/worldnews?
+Khoj: {{"links": ["https://www.reddit.com/r/worldnews/"]}}
+
+Now it's your turn to share actual webpage urls you'd like to read to answer the user's question.
+History:
+{chat_history}
+
+Q: {query}
+Khoj:
+""".strip()
+)
+
 online_search_conversation_subqueries = PromptTemplate.from_template(
     """
 You are Khoj, an advanced google search assistant. You are tasked with constructing **up to three** google search queries to answer the user's question.
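One detail worth calling out in the prompt above: the doubled braces in the example responses. PromptTemplate treats single braces as template variables, so {{ and }} are needed to emit literal JSON braces. A standalone sketch with an illustrative template string:

from langchain.prompts import PromptTemplate

# Single braces are template variables; doubled braces render as literal { and }
template = PromptTemplate.from_template('Khoj: {{"links": ["{url}"]}}')
print(template.format(url="https://example.com"))
# Khoj: {"links": ["https://example.com"]}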

@@ -2,6 +2,7 @@ import asyncio
 import json
 import logging
 import os
+from collections import defaultdict
 from typing import Dict, Tuple, Union
 
 import aiohttp
@@ -9,7 +10,11 @@ import requests
 from bs4 import BeautifulSoup
 from markdownify import markdownify
 
-from khoj.routers.helpers import extract_relevant_info, generate_online_subqueries
+from khoj.routers.helpers import (
+    extract_relevant_info,
+    generate_online_subqueries,
+    infer_webpage_urls,
+)
 from khoj.utils.helpers import is_none_or_empty, timer
 from khoj.utils.rawconfig import LocationData
@@ -38,7 +43,7 @@ MAX_WEBPAGES_TO_READ = 1
 
 async def search_online(query: str, conversation_history: dict, location: LocationData):
-    if SERPER_DEV_API_KEY is None:
+    if not online_search_enabled():
         logger.warn("SERPER_DEV_API_KEY is not set")
         return {}
@@ -93,10 +98,20 @@ def search_with_google(subquery: str):
     return extracted_search_result
 
 
+async def read_webpages(query: str, conversation_history: dict, location: LocationData):
+    "Infer web pages to read from the query and extract relevant information from them"
+    urls = await infer_webpage_urls(query, conversation_history, location)
+
+    results: Dict[str, Dict[str, str]] = defaultdict(dict)
+    for url in urls:
+        _, result = await read_webpage_and_extract_content(query, url)
+        results[url]["extracted_content"] = result
+    return results
+
+
 async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]:
     try:
         with timer(f"Reading web page at '{url}' took", logger):
-            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage(url)
+            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_at_url(url)
         with timer(f"Extracting relevant information from web page at '{url}' took", logger):
             extracted_info = await extract_relevant_info(subquery, content)
         return subquery, extracted_info
@@ -105,7 +120,7 @@ async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str
         return subquery, None
 
 
-async def read_webpage(web_url: str) -> str:
+async def read_webpage_at_url(web_url: str) -> str:
     headers = {
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
     }
@@ -129,3 +144,7 @@ async def read_webpage_with_olostep(web_url: str) -> str:
             response.raise_for_status()
             response_json = await response.json()
             return response_json["markdown_content"]
+
+
+def online_search_enabled():
+    return SERPER_DEV_API_KEY is not None
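The new read_webpages helper awaits each page read in sequence. Because read_webpage_and_extract_content already returns a (subquery, extracted_info) tuple per URL, the reads could also be fanned out concurrently. A hypothetical variant, not part of this commit, assuming the same helpers as in the diff above:

import asyncio

async def read_webpages_concurrently(query: str, conversation_history: dict, location: LocationData):
    # Same contract as read_webpages, but fetch all inferred pages at once
    urls = await infer_webpage_urls(query, conversation_history, location)
    pairs = await asyncio.gather(*(read_webpage_and_extract_content(query, url) for url in urls))
    return {url: {"extracted_content": result} for url, (_, result) in zip(urls, pairs)}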

@@ -36,6 +36,7 @@ from khoj.utils.config import GPT4AllProcessorModel
 from khoj.utils.helpers import (
     ConversationCommand,
     is_none_or_empty,
+    is_valid_url,
     log_telemetry,
     mode_descriptions_for_llm,
     timer,
@@ -229,6 +230,35 @@ async def aget_relevant_output_modes(query: str, conversation_history: dict):
     return ConversationCommand.Default
 
 
+async def infer_webpage_urls(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
+    """
+    Infer webpage links from the given query
+    """
+    location = f"{location_data.city}, {location_data.region}, {location_data.country}" if location_data else "Unknown"
+    chat_history = construct_chat_history(conversation_history)
+
+    utc_date = datetime.utcnow().strftime("%Y-%m-%d")
+    online_queries_prompt = prompts.infer_webpages_to_read.format(
+        current_date=utc_date,
+        query=q,
+        chat_history=chat_history,
+        location=location,
+    )
+
+    response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object")
+
+    # Validate that the response is a non-empty, JSON-serializable list of URLs
+    try:
+        response = response.strip()
+        urls = json.loads(response)
+        valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)}
+        if is_none_or_empty(valid_unique_urls):
+            raise ValueError(f"Invalid list of urls: {response}")
+        return list(valid_unique_urls)
+    except Exception:
+        raise ValueError(f"Invalid list of urls: {response}")
+
+
 async def generate_online_subqueries(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
     """
     Generate subqueries from the given query
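The try block above both validates and deduplicates: entries failing is_valid_url are filtered out and the set comprehension collapses repeats. Tracing it by hand on an illustrative model response (the response string is hypothetical):

import json

from khoj.utils.helpers import is_valid_url

# Hypothetical model output with a duplicate and a malformed entry
response = '{"links": ["https://example.com", "https://example.com", "not-a-url"]}'
urls = json.loads(response)
valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)}
print(valid_unique_urls)  # {'https://example.com'}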

@@ -15,6 +15,7 @@ from os import path
 from pathlib import Path
 from time import perf_counter
 from typing import TYPE_CHECKING, Optional, Union
+from urllib.parse import urlparse
 
 import torch
 from asgiref.sync import sync_to_async
@@ -340,3 +341,12 @@ def in_debug_mode():
     """Check if Khoj is running in debug mode.
     Set KHOJ_DEBUG environment variable to true to enable debug mode."""
     return is_env_var_true("KHOJ_DEBUG")
+
+
+def is_valid_url(url: str) -> bool:
+    """Check if a string is a valid URL"""
+    try:
+        result = urlparse(url.strip())
+        return all([result.scheme, result.netloc])
+    except:
+        return False
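urlparse rarely raises, so the effective check is that both a scheme and a network location were parsed. A few illustrative calls:

from khoj.utils.helpers import is_valid_url

print(is_valid_url("https://en.wikipedia.org/wiki/Great_Chicago_Fire"))  # True
print(is_valid_url("example.com"))  # False: no scheme, parsed entirely as a path
print(is_valid_url("https://"))     # False: scheme present but netloc empty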

@@ -7,7 +7,10 @@ import pytest
 from scipy.stats import linregress
 
 from khoj.processor.embeddings import EmbeddingsModel
-from khoj.processor.tools.online_search import read_webpage, read_webpage_with_olostep
+from khoj.processor.tools.online_search import (
+    read_webpage_at_url,
+    read_webpage_with_olostep,
+)
 from khoj.utils import helpers
@@ -90,7 +93,7 @@ async def test_reading_webpage():
     website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"
 
     # Act
-    response = await read_webpage(website)
+    response = await read_webpage_at_url(website)
 
     # Assert
     assert (