Mirror of https://github.com/khoj-ai/khoj.git, synced 2025-02-17 08:04:21 +00:00
Allow directly reading web pages, even when SERP not enabled (#676)
### Overview
Khoj can now read websites directly without needing to go through the search step first.

### Details
- Parallelize simple webpage read and extractor
- Rename `extract_content` field in online results to `webpages`
- Tweak prompts to extract information from webpages and online results
- Test the select webpage as data source and extract web urls chat actors
- Render webpages read in chat response references on the Web and Desktop apps
- Pass multiple webpages with their urls in the online results context
- Support the webpage command in the chat API
- Add a webpage chat command to read web pages requested by the user
- Create a chat actor for directly reading webpages based on the user message
Commit 586654e2af: 11 changed files with 237 additions and 48 deletions
Render webpage references in the chat clients. The same change is applied to both the Web and Desktop app chat pages:

```diff
@@ -87,7 +87,7 @@
 function generateOnlineReference(reference, index) {

     // Generate HTML for Chat Reference
-    let title = reference.title;
+    let title = reference.title || reference.link;
     let link = reference.link;
     let snippet = reference.snippet;
     let question = reference.question;
@@ -191,6 +191,15 @@
                 referenceSection.appendChild(polishedReference);
             }
         }
+
+        if (onlineReference.webpages && onlineReference.webpages.length > 0) {
+            numOnlineReferences += onlineReference.webpages.length;
+            for (let index in onlineReference.webpages) {
+                let reference = onlineReference.webpages[index];
+                let polishedReference = generateOnlineReference(reference, index);
+                referenceSection.appendChild(polishedReference);
+            }
+        }
     }

     return numOnlineReferences;
```
The equivalent hunks in the second chat client:

```diff
@@ -101,7 +101,7 @@ To get started, just start typing below. You can also type / to see a list of commands.
 function generateOnlineReference(reference, index) {

     // Generate HTML for Chat Reference
-    let title = reference.title;
+    let title = reference.title || reference.link;
     let link = reference.link;
     let snippet = reference.snippet;
     let question = reference.question;
@@ -205,6 +205,15 @@ To get started, just start typing below. You can also type / to see a list of commands.
                 referenceSection.appendChild(polishedReference);
             }
         }
+
+        if (onlineReference.webpages && onlineReference.webpages.length > 0) {
+            numOnlineReferences += onlineReference.webpages.length;
+            for (let index in onlineReference.webpages) {
+                let reference = onlineReference.webpages[index];
+                let polishedReference = generateOnlineReference(reference, index);
+                referenceSection.appendChild(polishedReference);
+            }
+        }
     }

     return numOnlineReferences;
```
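For context, both renderers above walk an `online_results` style payload where each subquery key may now carry a `webpages` collection of reference entries. A minimal sketch of that shape in Python; the subquery key and URLs are illustrative, while the field names (`webpages`, `title`, `link`, `snippet`) come from this commit's diffs:

```python
# Sketch of the online_results payload the chat clients render as references.
online_results = {
    "what is khoj": {  # illustrative subquery key
        "webpages": [
            {"link": "https://docs.khoj.dev/", "snippet": "Khoj is a personal AI assistant..."},
        ],
    },
}

# Mirror of the JS fallback above: the title defaults to the link when absent.
for result in online_results.values():
    for reference in result.get("webpages", []):
        title = reference.get("title") or reference["link"]
        print(title, "->", reference["snippet"][:40])
```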
Offline chat model: pass the renamed webpage extracts in the online results context:

```diff
@@ -177,8 +177,8 @@ def converse_offline(
     if ConversationCommand.Online in conversation_commands:
         simplified_online_results = online_results.copy()
         for result in online_results:
-            if online_results[result].get("extracted_content"):
-                simplified_online_results[result] = online_results[result]["extracted_content"]
+            if online_results[result].get("webpages"):
+                simplified_online_results[result] = online_results[result]["webpages"]

         conversation_primer = f"{prompts.online_search_conversation.format(online_results=str(simplified_online_results))}\n{conversation_primer}"
     if not is_none_or_empty(compiled_references_message):
```
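Worked through with concrete data, the simplification above replaces each subquery's full search response with just its webpage extract before the context is stringified for the offline model. The input values here are illustrative:

```python
# Illustrative subquery entry holding both raw search fields and the
# webpage extract introduced by this commit.
online_results = {
    "khoj features": {
        "organic": [{"title": "Khoj Docs", "link": "https://docs.khoj.dev/"}],
        "webpages": {"link": "https://docs.khoj.dev/", "snippet": "Khoj can read webpages..."},
    }
}

simplified_online_results = online_results.copy()
for result in online_results:
    if online_results[result].get("webpages"):
        simplified_online_results[result] = online_results[result]["webpages"]

print(simplified_online_results)
# {'khoj features': {'link': 'https://docs.khoj.dev/', 'snippet': 'Khoj can read webpages...'}}
```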
OpenAI chat model: type the online results, handle the webpage command, and widen the debug log truncation:

```diff
@@ -1,7 +1,7 @@
 import json
 import logging
 from datetime import datetime, timedelta
-from typing import Optional
+from typing import Dict, Optional

 from langchain.schema import ChatMessage

@@ -104,7 +104,7 @@ def send_message_to_model(messages, api_key, model, response_type="text"):
 def converse(
     references,
     user_query,
-    online_results: Optional[dict] = None,
+    online_results: Optional[Dict[str, Dict]] = None,
     conversation_log={},
     model: str = "gpt-3.5-turbo",
     api_key: Optional[str] = None,
@@ -142,7 +142,7 @@ def converse(
         completion_func(chat_response=prompts.no_online_results_found.format())
         return iter([prompts.no_online_results_found.format()])

-    if ConversationCommand.Online in conversation_commands:
+    if ConversationCommand.Online in conversation_commands or ConversationCommand.Webpage in conversation_commands:
         conversation_primer = (
             f"{prompts.online_search_conversation.format(online_results=str(online_results))}\n{conversation_primer}"
         )
@@ -158,7 +158,7 @@ def converse(
         max_prompt_size,
         tokenizer_name,
     )
-    truncated_messages = "\n".join({f"{message.content[:40]}..." for message in messages})
+    truncated_messages = "\n".join({f"{message.content[:70]}..." for message in messages})
     logger.debug(f"Conversation Context for GPT: {truncated_messages}")

     # Get Response from GPT
```
Prompts: tweak the personality, online search, and webpage extraction prompts; add a webpage data source example and the new infer_webpages_to_read prompt:

```diff
@@ -10,7 +10,7 @@ You were created by Khoj Inc. with the following capabilities:

 - You *CAN REMEMBER ALL NOTES and PERSONAL INFORMATION FOREVER* that the user ever shares with you.
 - Users can share files and other information with you using the Khoj Desktop, Obsidian or Emacs app. They can also drag and drop their files into the chat window.
-- You can generate images, look-up information from the internet, and answer questions based on the user's notes.
+- You *CAN* generate images, look-up real-time information from the internet, and answer questions based on the user's notes.
 - You cannot set reminders.
 - Say "I don't know" or "I don't understand" if you don't know what to say or if you don't know the answer to a question.
 - Ask crisp follow-up questions to get additional context, when the answer cannot be inferred from the provided notes or past conversations.
@@ -146,7 +146,8 @@ online_search_conversation = PromptTemplate.from_template(
 Use this up-to-date information from the internet to inform your response.
 Ask crisp follow-up questions to get additional context, when a helpful response cannot be provided from the online data or past conversations.

-Information from the internet: {online_results}
+Information from the internet:
+{online_results}
 """.strip()
 )
@@ -280,7 +281,7 @@ Target Query: {query}
 Web Pages:
 {corpus}

-Collate the relevant information from the website to answer the target query.
+Collate only relevant information from the website to answer the target query.
 """.strip()
 )
@@ -362,6 +363,14 @@ AI: Good morning! How can I help you today?
 Q: How can I share my files with Khoj?
 Khoj: {{"source": ["default", "online"]}}

+Example:
+Chat History:
+User: What is the first element in the periodic table?
+AI: The first element in the periodic table is Hydrogen.
+
+Q: Summarize this article https://en.wikipedia.org/wiki/Hydrogen
+Khoj: {{"source": ["webpage"]}}
+
 Example:
 Chat History:
 User: I want to start a new hobby. I'm thinking of learning to play the guitar.
@@ -380,6 +389,50 @@ Khoj:
 """.strip()
 )

+infer_webpages_to_read = PromptTemplate.from_template(
+    """
+You are Khoj, an advanced web page reading assistant. You are to construct **up to three, valid** webpage urls to read before answering the user's question.
+- You will receive the conversation history as context.
+- Add as much context from the previous questions and answers as required to construct the webpage urls.
+- Use multiple web page urls if required to retrieve the relevant information.
+- You have access to the whole internet to retrieve information.
+
+Which webpages will you need to read to answer the user's question?
+Provide web page links as a list of strings in a JSON object.
+Current Date: {current_date}
+User's Location: {location}
+
+Here are some examples:
+History:
+User: I like to use Hacker News to get my tech news.
+AI: Hacker News is an online forum for sharing and discussing the latest tech news. It is a great place to learn about new technologies and startups.
+
+Q: Summarize this post about vector database on Hacker News, https://news.ycombinator.com/item?id=12345
+Khoj: {{"links": ["https://news.ycombinator.com/item?id=12345"]}}
+
+History:
+User: I'm currently living in New York but I'm thinking about moving to San Francisco.
+AI: New York is a great city to live in. It has a lot of great restaurants and museums. San Francisco is also a great city to live in. It has good access to nature and a great tech scene.
+
+Q: What is the climate like in those cities?
+Khoj: {{"links": ["https://en.wikipedia.org/wiki/New_York_City", "https://en.wikipedia.org/wiki/San_Francisco"]}}
+
+History:
+User: Hey, how is it going?
+AI: Not too bad. How can I help you today?
+
+Q: What's the latest news on r/worldnews?
+Khoj: {{"links": ["https://www.reddit.com/r/worldnews/"]}}
+
+Now it's your turn to share actual webpage urls you'd like to read to answer the user's question.
+History:
+{chat_history}
+
+Q: {query}
+Khoj:
+""".strip()
+)
+
 online_search_conversation_subqueries = PromptTemplate.from_template(
     """
 You are Khoj, an advanced google search assistant. You are tasked with constructing **up to three** google search queries to answer the user's question.
```
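The new `infer_webpages_to_read` template is consumed by the `infer_webpage_urls` chat actor (see the `khoj.routers.helpers` hunks below). A minimal sketch of the round trip, assuming the khoj package is importable; the model reply shown is illustrative, not a real completion:

```python
import json
from datetime import datetime

# Module path per the imports elsewhere in this commit.
from khoj.processor.conversation import prompts

# current_date and location let the model ground relative references
# like "today" or "near me" in the user's query.
prompt = prompts.infer_webpages_to_read.format(
    current_date=datetime.utcnow().strftime("%Y-%m-%d"),
    query="Summarize this article https://en.wikipedia.org/wiki/Hydrogen",
    chat_history="",
    location="Unknown",
)

# Illustrative model reply: a JSON object with a "links" list of up to
# three valid webpage urls, as the template instructs.
response = '{"links": ["https://en.wikipedia.org/wiki/Hydrogen"]}'
urls = json.loads(response)["links"]
assert len(urls) <= 3
```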
Online search tools: parallelize webpage reads, rename extract_content to webpages, and add read_webpages for direct reading:

```diff
@@ -2,6 +2,7 @@ import asyncio
 import json
 import logging
 import os
+from collections import defaultdict
 from typing import Dict, Tuple, Union

 import aiohttp
@@ -9,7 +10,11 @@ import requests
 from bs4 import BeautifulSoup
 from markdownify import markdownify

-from khoj.routers.helpers import extract_relevant_info, generate_online_subqueries
+from khoj.routers.helpers import (
+    extract_relevant_info,
+    generate_online_subqueries,
+    infer_webpage_urls,
+)
 from khoj.utils.helpers import is_none_or_empty, timer
 from khoj.utils.rawconfig import LocationData

@@ -38,7 +43,7 @@ MAX_WEBPAGES_TO_READ = 1


 async def search_online(query: str, conversation_history: dict, location: LocationData):
-    if SERPER_DEV_API_KEY is None:
+    if not online_search_enabled():
         logger.warn("SERPER_DEV_API_KEY is not set")
         return {}

@@ -52,24 +57,21 @@ async def search_online(query: str, conversation_history: dict, location: LocationData):

     # Gather distinct web pages from organic search results of each subquery without an instant answer
     webpage_links = {
-        result["link"]
+        organic["link"]: subquery
         for subquery in response_dict
-        for result in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]
+        for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]
         if "answerBox" not in response_dict[subquery]
     }

     # Read, extract relevant info from the retrieved web pages
-    tasks = []
-    for webpage_link in webpage_links:
-        logger.info(f"Reading web page at '{webpage_link}'")
-        task = read_webpage_and_extract_content(subquery, webpage_link)
-        tasks.append(task)
+    logger.info(f"Reading web pages at: {webpage_links.keys()}")
+    tasks = [read_webpage_and_extract_content(subquery, link) for link, subquery in webpage_links.items()]
     results = await asyncio.gather(*tasks)

     # Collect extracted info from the retrieved web pages
-    for subquery, extracted_webpage_content in results:
-        if extracted_webpage_content is not None:
-            response_dict[subquery]["extracted_content"] = extracted_webpage_content
+    for subquery, webpage_extract, url in results:
+        if webpage_extract is not None:
+            response_dict[subquery]["webpages"] = {"link": url, "snippet": webpage_extract}

     return response_dict

@@ -93,19 +95,35 @@ def search_with_google(subquery: str):
     return extracted_search_result


-async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str]]:
+async def read_webpages(query: str, conversation_history: dict, location: LocationData):
+    "Infer web pages to read from the query and extract relevant information from them"
+    logger.info(f"Inferring web pages to read")
+    urls = await infer_webpage_urls(query, conversation_history, location)
+
+    logger.info(f"Reading web pages at: {urls}")
+    tasks = [read_webpage_and_extract_content(query, url) for url in urls]
+    results = await asyncio.gather(*tasks)
+
+    response: Dict[str, Dict] = defaultdict(dict)
+    response[query]["webpages"] = [
+        {"query": q, "link": url, "snippet": web_extract} for q, web_extract, url in results if web_extract is not None
+    ]
+    return response
+
+
+async def read_webpage_and_extract_content(subquery: str, url: str) -> Tuple[str, Union[None, str], str]:
     try:
         with timer(f"Reading web page at '{url}' took", logger):
-            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage(url)
+            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_at_url(url)
         with timer(f"Extracting relevant information from web page at '{url}' took", logger):
             extracted_info = await extract_relevant_info(subquery, content)
-        return subquery, extracted_info
+        return subquery, extracted_info, url
     except Exception as e:
         logger.error(f"Failed to read web page at '{url}' with {e}")
-        return subquery, None
+        return subquery, None, url


-async def read_webpage(web_url: str) -> str:
+async def read_webpage_at_url(web_url: str) -> str:
     headers = {
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
     }
@@ -129,3 +147,7 @@ async def read_webpage_with_olostep(web_url: str) -> str:
         response.raise_for_status()
         response_json = await response.json()
         return response_json["markdown_content"]
+
+
+def online_search_enabled():
+    return SERPER_DEV_API_KEY is not None
```
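Taken together, a direct read now flows query, to inferred URLs, to parallel `read_webpage_and_extract_content` calls, to a `webpages` list keyed by the original query. A usage sketch, assuming a running khoj setup with a configured chat model (URL inference calls the model); the query is illustrative:

```python
import asyncio

from khoj.processor.tools.online_search import read_webpages


async def demo():
    # conversation_history is the chat meta_log dict the chat API passes
    # through; location may be None, in which case it is sent as "Unknown".
    query = "Summarize the wikipedia page on the history of the internet"
    response = await read_webpages(query, conversation_history={}, location=None)

    # Response shape: {query: {"webpages": [{"query", "link", "snippet"}, ...]}}
    for webpage in response[query]["webpages"]:
        print(webpage["link"], "->", webpage["snippet"][:60])


asyncio.run(demo())
```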
Chat API: support the webpage command and fall back to direct webpage reads when online search is not enabled:

```diff
@@ -14,7 +14,11 @@
 from khoj.database.models import KhojUser
 from khoj.processor.conversation.prompts import help_message, no_entries_found
 from khoj.processor.conversation.utils import save_to_conversation_log
-from khoj.processor.tools.online_search import search_online
+from khoj.processor.tools.online_search import (
+    online_search_enabled,
+    read_webpages,
+    search_online,
+)
 from khoj.routers.api import extract_references_and_questions
 from khoj.routers.helpers import (
     ApiUserRateLimiter,
@@ -238,6 +242,7 @@ async def chat(
 ) -> Response:
     user: KhojUser = request.user.object
     q = unquote(q)
+    logger.info(f"Chat request by {user.username}: {q}")

     await is_ready_to_chat(user)
     conversation_commands = [get_conversation_command(query=q, any_references=True)]
@@ -280,7 +285,7 @@ async def chat(
     compiled_references, inferred_queries, defiltered_query = await extract_references_and_questions(
         request, common, meta_log, q, (n or 5), (d or math.inf), conversation_commands, location
     )
-    online_results: Dict = dict()
+    online_results: Dict[str, Dict] = {}

     if conversation_commands == [ConversationCommand.Notes] and not await EntryAdapters.auser_has_entries(user):
         no_entries_found_format = no_entries_found.format()
@@ -294,13 +299,23 @@ async def chat(
         conversation_commands.remove(ConversationCommand.Notes)

     if ConversationCommand.Online in conversation_commands:
+        if not online_search_enabled():
+            conversation_commands.remove(ConversationCommand.Online)
+            # If online search is not enabled, try to read webpages directly
+            if ConversationCommand.Webpage not in conversation_commands:
+                conversation_commands.append(ConversationCommand.Webpage)
+        else:
+            try:
+                online_results = await search_online(defiltered_query, meta_log, location)
+            except ValueError as e:
+                logger.warning(f"Error searching online: {e}. Attempting to respond without online results")
+
+    if ConversationCommand.Webpage in conversation_commands:
         try:
-            online_results = await search_online(defiltered_query, meta_log, location)
+            online_results = await read_webpages(defiltered_query, meta_log, location)
         except ValueError as e:
-            return StreamingResponse(
-                iter(["Please set your SERPER_DEV_API_KEY to get started with online searches 🌐"]),
-                media_type="text/event-stream",
-                status_code=200,
+            logger.warning(
+                f"Error directly reading webpages: {e}. Attempting to respond without online results", exc_info=True
             )

     if ConversationCommand.Image in conversation_commands:
```
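With the routing above, a user can also force a direct read with the new `/webpage` slash command (the command string comes from `ConversationCommand.Webpage = "webpage"` below). A hedged sketch of calling the chat API this way; the host, port, and auth header are deployment assumptions, not part of this commit:

```python
import requests

# Assumed local Khoj server address and bearer-token auth; adjust for
# your deployment.
response = requests.get(
    "http://localhost:42110/api/chat",
    params={"q": "/webpage Summarize https://en.wikipedia.org/wiki/Hydrogen"},
    headers={"Authorization": "Bearer <KHOJ_API_KEY>"},
)
print(response.text)
```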
Chat actors: add the infer_webpage_urls chat actor and time each chat actor call:

```diff
@@ -36,6 +36,7 @@ from khoj.utils.config import GPT4AllProcessorModel
 from khoj.utils.helpers import (
     ConversationCommand,
     is_none_or_empty,
+    is_valid_url,
     log_telemetry,
     mode_descriptions_for_llm,
     timer,
@@ -167,7 +168,8 @@ async def aget_relevant_information_sources(query: str, conversation_history: dict):
         chat_history=chat_history,
     )

-    response = await send_message_to_model_wrapper(relevant_tools_prompt, response_type="json_object")
+    with timer("Chat actor: Infer information sources to refer", logger):
+        response = await send_message_to_model_wrapper(relevant_tools_prompt, response_type="json_object")

     try:
         response = response.strip()
@@ -211,7 +213,8 @@ async def aget_relevant_output_modes(query: str, conversation_history: dict):
         chat_history=chat_history,
     )

-    response = await send_message_to_model_wrapper(relevant_mode_prompt)
+    with timer("Chat actor: Infer output mode for chat response", logger):
+        response = await send_message_to_model_wrapper(relevant_mode_prompt)

     try:
         response = response.strip()
@@ -229,6 +232,36 @@ async def aget_relevant_output_modes(query: str, conversation_history: dict):
     return ConversationCommand.Default


+async def infer_webpage_urls(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
+    """
+    Infer webpage links from the given query
+    """
+    location = f"{location_data.city}, {location_data.region}, {location_data.country}" if location_data else "Unknown"
+    chat_history = construct_chat_history(conversation_history)
+
+    utc_date = datetime.utcnow().strftime("%Y-%m-%d")
+    online_queries_prompt = prompts.infer_webpages_to_read.format(
+        current_date=utc_date,
+        query=q,
+        chat_history=chat_history,
+        location=location,
+    )
+
+    with timer("Chat actor: Infer webpage urls to read", logger):
+        response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object")
+
+    # Validate that the response is a non-empty, JSON-serializable list of URLs
+    try:
+        response = response.strip()
+        urls = json.loads(response)
+        valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)}
+        if is_none_or_empty(valid_unique_urls):
+            raise ValueError(f"Invalid list of urls: {response}")
+        return list(valid_unique_urls)
+    except Exception:
+        raise ValueError(f"Invalid list of urls: {response}")
+
+
 async def generate_online_subqueries(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
     """
     Generate subqueries from the given query
@@ -244,7 +277,8 @@ async def generate_online_subqueries(q: str, conversation_history: dict, location_data: LocationData) -> List[str]:
         location=location,
     )

-    response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object")
+    with timer("Chat actor: Generate online search subqueries", logger):
+        response = await send_message_to_model_wrapper(online_queries_prompt, response_type="json_object")

     # Validate that the response is a non-empty, JSON-serializable list
     try:
@@ -273,9 +307,10 @@ async def extract_relevant_info(q: str, corpus: str) -> Union[str, None]:
         corpus=corpus.strip(),
     )

-    response = await send_message_to_model_wrapper(
-        extract_relevant_information, prompts.system_prompt_extract_relevant_information
-    )
+    with timer("Chat actor: Extract relevant information from data", logger):
+        response = await send_message_to_model_wrapper(
+            extract_relevant_information, prompts.system_prompt_extract_relevant_information
+        )

     return response.strip()
@@ -304,8 +339,8 @@ async def generate_better_image_prompt(
     for result in online_results:
         if online_results[result].get("answerBox"):
             simplified_online_results[result] = online_results[result]["answerBox"]
-        elif online_results[result].get("extracted_content"):
-            simplified_online_results[result] = online_results[result]["extracted_content"]
+        elif online_results[result].get("webpages"):
+            simplified_online_results[result] = online_results[result]["webpages"]

     image_prompt = prompts.image_generation_improve_prompt.format(
         query=q,
@@ -316,7 +351,8 @@ async def generate_better_image_prompt(
         online_results=simplified_online_results,
     )

-    response = await send_message_to_model_wrapper(image_prompt)
+    with timer("Chat actor: Generate contextual image prompt", logger):
+        response = await send_message_to_model_wrapper(image_prompt)

     return response.strip()
@@ -365,7 +401,7 @@ def generate_chat_response(
     q: str,
     meta_log: dict,
     compiled_references: List[str] = [],
-    online_results: Dict[str, Any] = {},
+    online_results: Dict[str, Dict] = {},
     inferred_queries: List[str] = [],
     conversation_commands: List[ConversationCommand] = [ConversationCommand.Default],
     user: KhojUser = None,
```
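The URL validation in `infer_webpage_urls` is strict: it deduplicates, drops anything `is_valid_url` rejects, and raises if nothing valid remains. A small standalone sketch of that core, with an illustrative model output:

```python
import json

from khoj.utils.helpers import is_valid_url

# Illustrative model reply: one valid link, one garbage entry, one duplicate.
response = '{"links": ["https://en.wikipedia.org/wiki/Hydrogen", "not-a-url", "https://en.wikipedia.org/wiki/Hydrogen"]}'

urls = json.loads(response)
valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)}
if not valid_unique_urls:
    raise ValueError(f"Invalid list of urls: {response}")
print(sorted(valid_unique_urls))  # ['https://en.wikipedia.org/wiki/Hydrogen']
```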
Helpers: add the webpage conversation command, its descriptions, and URL validation:

```diff
@@ -15,6 +15,7 @@ from os import path
 from pathlib import Path
 from time import perf_counter
 from typing import TYPE_CHECKING, Optional, Union
+from urllib.parse import urlparse

 import torch
 from asgiref.sync import sync_to_async
@@ -270,6 +271,7 @@ class ConversationCommand(str, Enum):
     Notes = "notes"
     Help = "help"
     Online = "online"
+    Webpage = "webpage"
     Image = "image"
@@ -278,15 +280,17 @@ command_descriptions = {
     ConversationCommand.Notes: "Only talk about information that is available in your knowledge base.",
     ConversationCommand.Default: "The default command when no command specified. It intelligently auto-switches between general and notes mode.",
     ConversationCommand.Online: "Search for information on the internet.",
+    ConversationCommand.Webpage: "Get information from webpage links provided by you.",
     ConversationCommand.Image: "Generate images by describing your imagination in words.",
     ConversationCommand.Help: "Display a help message with all available commands and other metadata.",
 }

 tool_descriptions_for_llm = {
     ConversationCommand.Default: "To use a mix of your internal knowledge and the user's personal knowledge, or if you don't entirely understand the query.",
-    ConversationCommand.General: "Use this when you can answer the question without any outside information or personal knowledge",
+    ConversationCommand.General: "To use when you can answer the question without any outside information or personal knowledge",
     ConversationCommand.Notes: "To search the user's personal knowledge base. Especially helpful if the question expects context from the user's notes or documents.",
     ConversationCommand.Online: "To search for the latest, up-to-date information from the internet. Note: **Questions about Khoj should always use this data source**",
+    ConversationCommand.Webpage: "To use if the user has directly provided the webpage urls or you are certain of the webpage urls to read.",
 }

 mode_descriptions_for_llm = {
@@ -340,3 +344,12 @@ def in_debug_mode():
     """Check if Khoj is running in debug mode.
     Set KHOJ_DEBUG environment variable to true to enable debug mode."""
     return is_env_var_true("KHOJ_DEBUG")
+
+
+def is_valid_url(url: str) -> bool:
+    """Check if a string is a valid URL"""
+    try:
+        result = urlparse(url.strip())
+        return all([result.scheme, result.netloc])
+    except:
+        return False
```
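For reference, `urlparse` only reports a scheme and netloc for absolute URLs, so bare domains and prose are rejected by the new validator:

```python
from khoj.utils.helpers import is_valid_url

assert is_valid_url("https://en.wikipedia.org/wiki/Hydrogen") is True
assert is_valid_url("  https://news.ycombinator.com  ") is True  # input is stripped first
assert is_valid_url("en.wikipedia.org/wiki/Hydrogen") is False   # no scheme
assert is_valid_url("not a url") is False
```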
Tests: rename read_webpage to read_webpage_at_url:

```diff
@@ -7,7 +7,10 @@ import pytest
 from scipy.stats import linregress

 from khoj.processor.embeddings import EmbeddingsModel
-from khoj.processor.tools.online_search import read_webpage, read_webpage_with_olostep
+from khoj.processor.tools.online_search import (
+    read_webpage_at_url,
+    read_webpage_with_olostep,
+)
 from khoj.utils import helpers
@@ -90,7 +93,7 @@ async def test_reading_webpage():
     website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"

     # Act
-    response = await read_webpage(website)
+    response = await read_webpage_at_url(website)

     # Assert
     assert (
```
Tests: cover the select webpage data source and infer webpage urls chat actors:

```diff
@@ -11,6 +11,7 @@ from khoj.routers.helpers import (
     aget_relevant_information_sources,
     aget_relevant_output_modes,
     generate_online_subqueries,
+    infer_webpage_urls,
 )
 from khoj.utils.helpers import ConversationCommand
@@ -510,6 +511,34 @@ async def test_select_data_sources_actor_chooses_to_search_online(chat_client):
     assert ConversationCommand.Online in conversation_commands


+# ----------------------------------------------------------------------------------------------------
+@pytest.mark.anyio
+@pytest.mark.django_db(transaction=True)
+async def test_select_data_sources_actor_chooses_to_read_webpage(chat_client):
+    # Arrange
+    user_query = "Summarize the wikipedia page on the history of the internet"
+
+    # Act
+    conversation_commands = await aget_relevant_information_sources(user_query, {})
+
+    # Assert
+    assert ConversationCommand.Webpage in conversation_commands
+
+
+# ----------------------------------------------------------------------------------------------------
+@pytest.mark.anyio
+@pytest.mark.django_db(transaction=True)
+async def test_infer_webpage_urls_actor_extracts_correct_links(chat_client):
+    # Arrange
+    user_query = "Summarize the wikipedia page on the history of the internet"
+
+    # Act
+    urls = await infer_webpage_urls(user_query, {}, None)
+
+    # Assert
+    assert "https://en.wikipedia.org/wiki/History_of_the_Internet" in urls
+
+
 # Helpers
 # ----------------------------------------------------------------------------------------------------
 def populate_chat_history(message_list):
```