Create chat actor for directly reading webpages based on user message
- Add a prompt for the read-webpages chat actor to extract or infer webpage links
- Make the chat actor infer or extract the webpages to read directly from the user message
- Rename the previous read_webpage function to the narrower read_webpage_at_url
This commit is contained in: parent e549824fe2, commit 6118d1ff57
5 changed files with 112 additions and 6 deletions
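
For orientation, here is a minimal, hypothetical usage sketch of the flow this commit introduces (not part of the diff): read_webpages() asks the new chat actor to infer webpage URLs from the user message and chat history, then reads each page and extracts the relevant information. It assumes a chat model is configured, since URL inference goes through the LLM, and that LocationData accepts city/region/country keyword arguments.

import asyncio

from khoj.processor.tools.online_search import read_webpages
from khoj.utils.rawconfig import LocationData


async def demo():
    # Location and chat history feed the URL-inference prompt as context
    location = LocationData(city="San Francisco", region="California", country="USA")
    results = await read_webpages(
        "What's the latest news on r/worldnews?",
        conversation_history={},
        location=location,
    )
    for url, info in results.items():
        # extracted_content may be None if reading or extraction failed
        print(url, (info.get("extracted_content") or "")[:200])


asyncio.run(demo())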
@@ -380,6 +380,50 @@ Khoj:
""".strip()
)

+infer_webpages_to_read = PromptTemplate.from_template(
+    """
+You are Khoj, an advanced web page reading assistant. You are to construct **up to three, valid** webpage urls to read before answering the user's question.
+- You will receive the conversation history as context.
+- Add as much context from the previous questions and answers as required to construct the webpage urls.
+- Use multiple web page urls if required to retrieve the relevant information.
+- You have access to the whole internet to retrieve information.
+
+Which webpages will you need to read to answer the user's question?
+Provide web page links as a list of strings in a JSON object.
+Current Date: {current_date}
+User's Location: {location}
+
+Here are some examples:
+History:
+User: I like to use Hacker News to get my tech news.
+AI: Hacker News is an online forum for sharing and discussing the latest tech news. It is a great place to learn about new technologies and startups.
+
+Q: Summarize this post about vector database on Hacker News, https://news.ycombinator.com/item?id=12345
+Khoj: {{"links": ["https://news.ycombinator.com/item?id=12345"]}}
+
+History:
+User: I'm currently living in New York but I'm thinking about moving to San Francisco.
+AI: New York is a great city to live in. It has a lot of great restaurants and museums. San Francisco is also a great city to live in. It has good access to nature and a great tech scene.
+
+Q: What is the climate like in those cities?
+Khoj: {{"links": ["https://en.wikipedia.org/wiki/New_York_City", "https://en.wikipedia.org/wiki/San_Francisco"]}}
+
+History:
+User: Hey, how is it going?
+AI: Not too bad. How can I help you today?
+
+Q: What's the latest news on r/worldnews?
+Khoj: {{"links": ["https://www.reddit.com/r/worldnews/"]}}
+
+Now it's your turn to share actual webpage urls you'd like to read to answer the user's question.
+History:
+{chat_history}
+
+Q: {query}
+Khoj:
+""".strip()
+)
+
online_search_conversation_subqueries = PromptTemplate.from_template(
    """
You are Khoj, an advanced google search assistant. You are tasked with constructing **up to three** google search queries to answer the user's question.
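
A quick sketch of how the new template gets filled in (mirroring the infer_webpage_urls helper further down in this commit); the import path for the prompts module is assumed, and the expected reply shape comes from the few-shot examples above.

from datetime import datetime

from khoj.processor.conversation import prompts  # assumed import path for the prompts module

prompt = prompts.infer_webpages_to_read.format(
    current_date=datetime.utcnow().strftime("%Y-%m-%d"),
    query="What's the latest news on r/worldnews?",
    chat_history="",
    location="San Francisco, California, USA",
)
# The model is expected to reply with a JSON object such as:
# {"links": ["https://www.reddit.com/r/worldnews/"]}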
@@ -2,6 +2,7 @@ import asyncio
import json
import logging
import os
+from collections import defaultdict
from typing import Dict, Tuple, Union

import aiohttp
@@ -9,7 +10,11 @@ import requests
from bs4 import BeautifulSoup
from markdownify import markdownify

-from khoj.routers.helpers import extract_relevant_info, generate_online_subqueries
+from khoj.routers.helpers import (
+    extract_relevant_info,
+    generate_online_subqueries,
+    infer_webpage_urls,
+)
from khoj.utils.helpers import is_none_or_empty, timer
from khoj.utils.rawconfig import LocationData

@@ -38,7 +43,7 @@ MAX_WEBPAGES_TO_READ = 1


async def search_online(query: str, conversation_history: dict, location: LocationData):
-    if SERPER_DEV_API_KEY is None:
+    if not online_search_enabled():
        logger.warn("SERPER_DEV_API_KEY is not set")
        return {}

@@ -93,10 +98,20 @@ def search_with_google(subquery: str):
    return extracted_search_result


+async def read_webpages(query: str, conversation_history: dict, location: LocationData):
+    "Infer web pages to read from the query and extract relevant information from them"
+    urls = await infer_webpage_urls(query, conversation_history, location)
+    results: Dict[str, Dict[str, str]] = defaultdict(dict)
+    for url in urls:
+        _, result = await read_webpage_and_extract_content(query, url)
+        results[url]["extracted_content"] = result
+    return results
+
+
async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]:
    try:
        with timer(f"Reading web page at '{url}' took", logger):
-            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage(url)
+            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_at_url(url)
        with timer(f"Extracting relevant information from web page at '{url}' took", logger):
            extracted_info = await extract_relevant_info(subquery, content)
        return subquery, extracted_info
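
Per the code above, read_webpages builds a dict keyed by URL; the entry below is illustrative. Note that extracted_content ends up as None for any page whose read or extraction failed, since read_webpage_and_extract_content returns None on error.

# Illustrative return value of read_webpages (the summary text is made up):
results = {
    "https://news.ycombinator.com/item?id=12345": {
        "extracted_content": "Summary of the relevant points from the page...",
    },
}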
@@ -105,7 +120,7 @@ async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str
        return subquery, None


-async def read_webpage(web_url: str) -> str:
+async def read_webpage_at_url(web_url: str) -> str:
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    }
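
The body of read_webpage_at_url is not shown in this diff. Based on the aiohttp and markdownify imports and the browser-like headers above, a plausible shape is sketched below; treat it as an assumption, not the committed implementation.

import aiohttp
from markdownify import markdownify


async def read_webpage_at_url_sketch(web_url: str) -> str:
    headers = {"User-Agent": "Mozilla/5.0 ..."}  # same browser-like UA as in the diff
    async with aiohttp.ClientSession() as session:
        async with session.get(web_url, headers=headers) as response:
            response.raise_for_status()
            html = await response.text()
    return markdownify(html)  # convert fetched HTML to markdown for the extraction step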
@@ -129,3 +144,7 @@ async def read_webpage_with_olostep(web_url: str) -> str:
            response.raise_for_status()
            response_json = await response.json()
            return response_json["markdown_content"]
+
+
+def online_search_enabled():
+    return SERPER_DEV_API_KEY is not None
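
A hypothetical caller-side use of the new online_search_enabled() helper (this routing is not part of the diff): prefer Serper-backed search when the API key is configured, otherwise read inferred webpages directly.

from khoj.processor.tools.online_search import online_search_enabled, read_webpages, search_online


async def answer_online(query, conversation_history, location):
    # Hypothetical routing, not from this commit
    if online_search_enabled():
        return await search_online(query, conversation_history, location)
    return await read_webpages(query, conversation_history, location)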
@@ -36,6 +36,7 @@ from khoj.utils.config import GPT4AllProcessorModel
from khoj.utils.helpers import (
    ConversationCommand,
    is_none_or_empty,
+    is_valid_url,
    log_telemetry,
    mode_descriptions_for_llm,
    timer,
@@ -229,6 +230,35 @@ async def aget_relevant_output_modes(query: str, conversation_history: dict):
    return ConversationCommand.Default


+async def infer_webpage_urls(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
+    """
+    Infer webpage links from the given query
+    """
+    location = f"{location_data.city}, {location_data.region}, {location_data.country}" if location_data else "Unknown"
+    chat_history = construct_chat_history(conversation_history)
+
+    utc_date = datetime.utcnow().strftime("%Y-%m-%d")
+    online_queries_prompt = prompts.infer_webpages_to_read.format(
+        current_date=utc_date,
+        query=q,
+        chat_history=chat_history,
+        location=location,
+    )
+
+    response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object")
+
+    # Validate that the response is a non-empty, JSON-serializable list of URLs
+    try:
+        response = response.strip()
+        urls = json.loads(response)
+        valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)}
+        if is_none_or_empty(valid_unique_urls):
+            raise ValueError(f"Invalid list of urls: {response}")
+        return list(valid_unique_urls)
+    except Exception:
+        raise ValueError(f"Invalid list of urls: {response}")
+
+
async def generate_online_subqueries(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
    """
    Generate subqueries from the given query
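
To see the validation step above in isolation, here is a small sketch with a hypothetical model response; it reuses the same set-comprehension filter as infer_webpage_urls.

import json

from khoj.utils.helpers import is_none_or_empty, is_valid_url

# Hypothetical raw reply from the model in JSON mode
response = '{"links": ["https://en.wikipedia.org/wiki/San_Francisco", "not a url"]}'
urls = json.loads(response.strip())
valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)}
if is_none_or_empty(valid_unique_urls):
    raise ValueError(f"Invalid list of urls: {response}")
print(list(valid_unique_urls))  # only the well-formed Wikipedia URL survives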
@@ -15,6 +15,7 @@ from os import path
from pathlib import Path
from time import perf_counter
from typing import TYPE_CHECKING, Optional, Union
+from urllib.parse import urlparse

import torch
from asgiref.sync import sync_to_async
@@ -340,3 +341,12 @@ def in_debug_mode():
    """Check if Khoj is running in debug mode.
    Set KHOJ_DEBUG environment variable to true to enable debug mode."""
    return is_env_var_true("KHOJ_DEBUG")
+
+
+def is_valid_url(url: str) -> bool:
+    """Check if a string is a valid URL"""
+    try:
+        result = urlparse(url.strip())
+        return all([result.scheme, result.netloc])
+    except:
+        return False
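
Usage of the new helper, following the implementation above: a string must carry both a scheme and a host to pass.

from khoj.utils.helpers import is_valid_url

is_valid_url("https://news.ycombinator.com/item?id=12345")  # True: has scheme and host
is_valid_url("en.wikipedia.org/wiki/San_Francisco")         # False: missing the scheme
is_valid_url("not a url")                                    # False: not parseable as a URL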
@@ -7,7 +7,10 @@ import pytest
from scipy.stats import linregress

from khoj.processor.embeddings import EmbeddingsModel
-from khoj.processor.tools.online_search import read_webpage, read_webpage_with_olostep
+from khoj.processor.tools.online_search import (
+    read_webpage_at_url,
+    read_webpage_with_olostep,
+)
from khoj.utils import helpers


@@ -90,7 +93,7 @@ async def test_reading_webpage():
    website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"

    # Act
-    response = await read_webpage(website)
+    response = await read_webpage_at_url(website)

    # Assert
    assert (
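
A follow-up test for the new URL helper might look like the sketch below; it is not part of this commit and simply mirrors the style of the existing tests.

from khoj.utils.helpers import is_valid_url


def test_is_valid_url():
    # Assert: only strings with a scheme and host are treated as readable URLs
    assert is_valid_url("https://en.wikipedia.org/wiki/Great_Chicago_Fire")
    assert not is_valid_url("Great Chicago Fire")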