Create chat actor for directly reading webpages based on user message
- Add prompt for the read webpages chat actor to extract or infer webpage links
- Make chat actor infer or extract the webpage to read directly from the user message
- Rename previous read_webpage function to the narrower read_webpage_at_url function
This commit is contained in:
parent e549824fe2
commit 6118d1ff57
5 changed files with 112 additions and 6 deletions
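At a high level, the new chat actor turns the user's message (plus chat history) into a short list of URLs and then reads each page directly. A minimal, illustrative sketch of that call path using the helpers added in this commit — the wiring below is not part of the commit and assumes a configured chat model and network access:

import asyncio

from khoj.processor.tools.online_search import read_webpages

async def demo():
    query = "Summarize this post on Hacker News, https://news.ycombinator.com/item?id=12345"
    # Passing location=None falls back to "Unknown" inside the URL-inference prompt
    results = await read_webpages(query, conversation_history={}, location=None)
    for url, info in results.items():
        print(url, (info["extracted_content"] or "")[:200])

# asyncio.run(demo())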
@@ -380,6 +380,50 @@ Khoj:
 """.strip()
 )

+infer_webpages_to_read = PromptTemplate.from_template(
+    """
+You are Khoj, an advanced web page reading assistant. You are to construct **up to three, valid** webpage urls to read before answering the user's question.
+- You will receive the conversation history as context.
+- Add as much context from the previous questions and answers as required to construct the webpage urls.
+- Use multiple web page urls if required to retrieve the relevant information.
+- You have access to the whole internet to retrieve information.
+
+Which webpages will you need to read to answer the user's question?
+Provide web page links as a list of strings in a JSON object.
+Current Date: {current_date}
+User's Location: {location}
+
+Here are some examples:
+History:
+User: I like to use Hacker News to get my tech news.
+AI: Hacker News is an online forum for sharing and discussing the latest tech news. It is a great place to learn about new technologies and startups.
+
+Q: Summarize this post about vector database on Hacker News, https://news.ycombinator.com/item?id=12345
+Khoj: {{"links": ["https://news.ycombinator.com/item?id=12345"]}}
+
+History:
+User: I'm currently living in New York but I'm thinking about moving to San Francisco.
+AI: New York is a great city to live in. It has a lot of great restaurants and museums. San Francisco is also a great city to live in. It has good access to nature and a great tech scene.
+
+Q: What is the climate like in those cities?
+Khoj: {{"links": ["https://en.wikipedia.org/wiki/New_York_City", "https://en.wikipedia.org/wiki/San_Francisco"]}}
+
+History:
+User: Hey, how is it going?
+AI: Not too bad. How can I help you today?
+
+Q: What's the latest news on r/worldnews?
+Khoj: {{"links": ["https://www.reddit.com/r/worldnews/"]}}
+
+Now it's your turn to share actual webpage urls you'd like to read to answer the user's question.
+History:
+{chat_history}
+
+Q: {query}
+Khoj:
+""".strip()
+)
+
 online_search_conversation_subqueries = PromptTemplate.from_template(
     """
 You are Khoj, an advanced google search assistant. You are tasked with constructing **up to three** google search queries to answer the user's question.
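The new template asks the model to reply with a JSON object whose "links" key holds the webpage URLs, exactly as in the few-shot examples above. A small illustrative round-trip of that contract (the reply value is taken from the prompt's first example):

import json

raw_reply = '{"links": ["https://news.ycombinator.com/item?id=12345"]}'  # shape requested by the prompt
links = json.loads(raw_reply)["links"]
assert links == ["https://news.ycombinator.com/item?id=12345"]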
@@ -2,6 +2,7 @@ import asyncio
 import json
 import logging
 import os
+from collections import defaultdict
 from typing import Dict, Tuple, Union

 import aiohttp
@@ -9,7 +10,11 @@ import requests
 from bs4 import BeautifulSoup
 from markdownify import markdownify

-from khoj.routers.helpers import extract_relevant_info, generate_online_subqueries
+from khoj.routers.helpers import (
+    extract_relevant_info,
+    generate_online_subqueries,
+    infer_webpage_urls,
+)
 from khoj.utils.helpers import is_none_or_empty, timer
 from khoj.utils.rawconfig import LocationData

@@ -38,7 +43,7 @@ MAX_WEBPAGES_TO_READ = 1


 async def search_online(query: str, conversation_history: dict, location: LocationData):
-    if SERPER_DEV_API_KEY is None:
+    if not online_search_enabled():
         logger.warn("SERPER_DEV_API_KEY is not set")
         return {}

@@ -93,10 +98,20 @@ def search_with_google(subquery: str):
     return extracted_search_result


+async def read_webpages(query: str, conversation_history: dict, location: LocationData):
+    "Infer web pages to read from the query and extract relevant information from them"
+    urls = await infer_webpage_urls(query, conversation_history, location)
+    results: Dict[str, Dict[str, str]] = defaultdict(dict)
+    for url in urls:
+        _, result = await read_webpage_and_extract_content(query, url)
+        results[url]["extracted_content"] = result
+    return results
+
+
 async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]:
     try:
         with timer(f"Reading web page at '{url}' took", logger):
-            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage(url)
+            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_at_url(url)
         with timer(f"Extracting relevant information from web page at '{url}' took", logger):
             extracted_info = await extract_relevant_info(subquery, content)
         return subquery, extracted_info
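read_webpages keys its results by URL, with a per-URL dict as the value; because the container is a defaultdict(dict), the assignment to results[url]["extracted_content"] works even for URLs not seen before. A standalone illustration of that structure (not the commit's code):

from collections import defaultdict
from typing import Dict

results: Dict[str, Dict[str, str]] = defaultdict(dict)
results["https://example.com"]["extracted_content"] = "summary of the page"
print(dict(results))  # {'https://example.com': {'extracted_content': 'summary of the page'}}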
@@ -105,7 +120,7 @@ async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str
         return subquery, None


-async def read_webpage(web_url: str) -> str:
+async def read_webpage_at_url(web_url: str) -> str:
     headers = {
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
     }
@@ -129,3 +144,7 @@ async def read_webpage_with_olostep(web_url: str) -> str:
             response.raise_for_status()
             response_json = await response.json()
             return response_json["markdown_content"]
+
+
+def online_search_enabled():
+    return SERPER_DEV_API_KEY is not None
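The new online_search_enabled helper centralizes the Serper API-key check, so search_online (earlier in this file) can gate itself on configuration instead of inspecting the module constant directly. A hypothetical caller-side guard in the same spirit (illustrative only):

from khoj.processor.tools.online_search import online_search_enabled, search_online

async def search_if_configured(query, conversation_history, location):
    # Skip Serper-backed search when no API key is configured
    if not online_search_enabled():
        return {}
    return await search_online(query, conversation_history, location)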
@@ -36,6 +36,7 @@ from khoj.utils.config import GPT4AllProcessorModel
 from khoj.utils.helpers import (
     ConversationCommand,
     is_none_or_empty,
+    is_valid_url,
     log_telemetry,
     mode_descriptions_for_llm,
     timer,
@@ -229,6 +230,35 @@ async def aget_relevant_output_modes(query: str, conversation_history: dict):
         return ConversationCommand.Default


+async def infer_webpage_urls(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
+    """
+    Infer webpage links from the given query
+    """
+    location = f"{location_data.city}, {location_data.region}, {location_data.country}" if location_data else "Unknown"
+    chat_history = construct_chat_history(conversation_history)
+
+    utc_date = datetime.utcnow().strftime("%Y-%m-%d")
+    online_queries_prompt = prompts.infer_webpages_to_read.format(
+        current_date=utc_date,
+        query=q,
+        chat_history=chat_history,
+        location=location,
+    )
+
+    response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object")
+
+    # Validate that the response is a non-empty, JSON-serializable list of URLs
+    try:
+        response = response.strip()
+        urls = json.loads(response)
+        valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)}
+        if is_none_or_empty(valid_unique_urls):
+            raise ValueError(f"Invalid list of urls: {response}")
+        return list(valid_unique_urls)
+    except Exception:
+        raise ValueError(f"Invalid list of urls: {response}")
+
+
 async def generate_online_subqueries(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
     """
     Generate subqueries from the given query
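infer_webpage_urls raises ValueError whenever the model's reply cannot be parsed into at least one well-formed URL, so callers that prefer to degrade gracefully need to catch it. A hedged usage sketch (the helper name and signature are from the diff; the empty-list fallback is illustrative):

from khoj.routers.helpers import infer_webpage_urls

async def urls_or_empty(query, conversation_history, location_data):
    # Fall back to reading no webpages if the model reply is not a valid list of URLs
    try:
        return await infer_webpage_urls(query, conversation_history, location_data)
    except ValueError:
        return []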
@@ -15,6 +15,7 @@ from os import path
 from pathlib import Path
 from time import perf_counter
 from typing import TYPE_CHECKING, Optional, Union
+from urllib.parse import urlparse

 import torch
 from asgiref.sync import sync_to_async
@@ -340,3 +341,12 @@ def in_debug_mode():
     """Check if Khoj is running in debug mode.
     Set KHOJ_DEBUG environment variable to true to enable debug mode."""
     return is_env_var_true("KHOJ_DEBUG")
+
+
+def is_valid_url(url: str) -> bool:
+    """Check if a string is a valid URL"""
+    try:
+        result = urlparse(url.strip())
+        return all([result.scheme, result.netloc])
+    except:
+        return False
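is_valid_url accepts only strings that urlparse can split into both a scheme and a network location, so bare domains without a scheme are rejected along with plain text. For example:

from khoj.utils.helpers import is_valid_url

assert is_valid_url("https://en.wikipedia.org/wiki/Great_Chicago_Fire")
assert not is_valid_url("not a url")
assert not is_valid_url("example.com")  # no scheme, so urlparse leaves scheme empty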
@@ -7,7 +7,10 @@ import pytest
 from scipy.stats import linregress

 from khoj.processor.embeddings import EmbeddingsModel
-from khoj.processor.tools.online_search import read_webpage, read_webpage_with_olostep
+from khoj.processor.tools.online_search import (
+    read_webpage_at_url,
+    read_webpage_with_olostep,
+)
 from khoj.utils import helpers

@@ -90,7 +93,7 @@ async def test_reading_webpage():
     website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"

     # Act
-    response = await read_webpage(website)
+    response = await read_webpage_at_url(website)

     # Assert
     assert (
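Outside the test, the renamed read_webpage_at_url helper can be awaited directly whenever the URL is already known; per its signature it returns the page content as a string. A minimal sketch using the same page as the test above (requires network access):

import asyncio

from khoj.processor.tools.online_search import read_webpage_at_url

async def fetch_page():
    # Download the page and return its content as text
    return await read_webpage_at_url("https://en.wikipedia.org/wiki/Great_Chicago_Fire")

# content = asyncio.run(fetch_page())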