Fall back through enabled scrapers to reduce web page read failures

- Set up scrapers via API keys in environment variables, by explicitly adding
  them via the admin panel, or by enabling a single scraper via the server
  chat settings.

- Use validation to ensure only valid scrapers are added via the admin panel,
  e.g. an API key must be present for scrapers that require one.

- Modularize the read webpage functions to take the API key and URL as
  arguments. This removes their dependence on constants loaded in
  online_search; the functions are now mostly self-contained.

- Improve the ability to read web pages by exploiting the differing speed and
  success rates of the available scrapers (see the sketch below). The optimal
  configuration still needs to be discovered.
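
As a rough illustration, the fallback behavior this commit introduces amounts
to the loop sketched below (a minimal sketch with simplified names, not the
exact khoj code; the real loop also extracts relevant information from the
page before declaring success):

from typing import Awaitable, Callable, Optional

# Try each enabled scraper in priority order until one returns page content.
async def read_with_fallback(
    url: str, scrapers: list[tuple[str, Callable[[str], Awaitable[str]]]]
) -> Optional[str]:
    for name, reader in scrapers:
        try:
            content = await reader(url)
            if content:
                return content  # first scraper to return content wins
        except Exception:
            continue  # fall through to the next enabled scraper
    return None  # every enabled scraper failed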
Debanjum Singh Solanky 2024-10-16 00:37:46 -07:00
parent 11c64791aa
commit d94abba2dc
6 changed files with 196 additions and 76 deletions

View file

@@ -1,6 +1,7 @@
import json
import logging
import math
import os
import random
import re
import secrets
@@ -10,7 +11,6 @@ from enum import Enum
from typing import Callable, Iterable, List, Optional, Type
import cron_descriptor
import django
from apscheduler.job import Job
from asgiref.sync import sync_to_async
from django.contrib.sessions.backends.db import SessionStore
@@ -52,6 +52,7 @@ from khoj.database.models import (
UserTextToImageModelConfig,
UserVoiceModelConfig,
VoiceModelOption,
WebScraper,
)
from khoj.processor.conversation import prompts
from khoj.search_filter.date_filter import DateFilter
@@ -1032,17 +1033,43 @@ class ConversationAdapters:
return await ConversationAdapters.aget_default_conversation_config(user)
@staticmethod
async def aget_webscraper(FIRECRAWL_API_KEY: str = None, OLOSTEP_API_KEY: str = None):
server_chat_settings: ServerChatSettings = await ServerChatSettings.objects.filter().afirst()
async def aget_server_webscraper():
server_chat_settings = await ServerChatSettings.objects.filter().prefetch_related("web_scraper").afirst()
if server_chat_settings is not None and server_chat_settings.web_scraper is not None:
web_scraper = ServerChatSettings.WebScraper(server_chat_settings.web_scraper)
if (web_scraper == ServerChatSettings.WebScraper.FIRECRAWL and FIRECRAWL_API_KEY) or (
web_scraper == ServerChatSettings.WebScraper.OLOSTEP and OLOSTEP_API_KEY
):
return web_scraper
# Fallback to JinaAI if the API keys for the other providers are not set
# JinaAI is the default web scraper as it does not require an API key
return ServerChatSettings.WebScraper.JINAAI
return server_chat_settings.web_scraper
return None
@staticmethod
async def aget_enabled_webscrapers():
enabled_scrapers = []
server_webscraper = await ConversationAdapters.aget_server_webscraper()
if server_webscraper:
# Only use the webscraper set in the server chat settings
enabled_scrapers = [
(server_webscraper.type, server_webscraper.api_key, server_webscraper.api_url, server_webscraper.name)
]
if not enabled_scrapers:
# Use the enabled web scrapers, newest created scraper first, until web page content is retrieved
enabled_scrapers = [
(scraper.type, scraper.api_key, scraper.api_url, scraper.name)
async for scraper in WebScraper.objects.all().order_by("-created_at").aiterator()
]
if not enabled_scrapers:
# Use scrapers enabled via environment variables
if os.getenv("FIRECRAWL_API_KEY"):
api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
enabled_scrapers.append(
(WebScraper.WebScraperType.FIRECRAWL, os.getenv("FIRECRAWL_API_KEY"), api_url, "Firecrawl")
)
if os.getenv("OLOSTEP_API_KEY"):
api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
enabled_scrapers.append(
(WebScraper.WebScraperType.OLOSTEP, os.getenv("OLOSTEP_API_KEY"), api_url, "Olostep")
)
# Jina is the default fallback scraper as it does not require an API key
api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
enabled_scrapers.append((WebScraper.WebScraperType.JINA, os.getenv("JINA_API_KEY"), api_url, "Jina"))
return enabled_scrapers
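
For reference, aget_enabled_webscrapers resolves scrapers in priority order:
the scraper pinned in server chat settings, then admin-panel scrapers (newest
first), then scrapers configured via environment variables, with Jina always
appended as the keyless fallback. A hypothetical consumer of the returned
(type, api_key, api_url, name) tuples:

from khoj.database.adapters import ConversationAdapters

# Hypothetical snippet; assumes Django is configured for the khoj project.
async def list_enabled_scrapers():
    enabled = await ConversationAdapters.aget_enabled_webscrapers()
    for scraper_type, api_key, api_url, name in enabled:
        print(f"{name}: type={scraper_type}, url={api_url}, has_key={bool(api_key)}")
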
@staticmethod
def create_conversation_from_public_conversation(

View file

@@ -31,6 +31,7 @@ from khoj.database.models import (
UserSearchModelConfig,
UserVoiceModelConfig,
VoiceModelOption,
WebScraper,
)
from khoj.utils.helpers import ImageIntentType
@@ -202,6 +203,19 @@ class ServerChatSettingsAdmin(admin.ModelAdmin):
)
@admin.register(WebScraper)
class WebScraperAdmin(admin.ModelAdmin):
list_display = (
"name",
"type",
"api_key",
"api_url",
"created_at",
)
search_fields = ("name", "api_key", "api_url", "type")
ordering = ("-created_at",)
@admin.register(Conversation)
class ConversationAdmin(admin.ModelAdmin):
list_display = (

View file

@@ -1,21 +0,0 @@
# Generated by Django 5.0.8 on 2024-10-16 00:06
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("database", "0067_alter_agent_style_icon"),
]
operations = [
migrations.AddField(
model_name="serverchatsettings",
name="web_scraper",
field=models.CharField(
choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jinaai", "JinaAI")],
default="jinaai",
max_length=20,
),
),
]

View file

@@ -0,0 +1,47 @@
# Generated by Django 5.0.8 on 2024-10-16 06:51
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("database", "0067_alter_agent_style_icon"),
]
operations = [
migrations.CreateModel(
name="WebScraper",
fields=[
("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
("created_at", models.DateTimeField(auto_now_add=True)),
("updated_at", models.DateTimeField(auto_now=True)),
("name", models.CharField(blank=True, default=None, max_length=200, null=True, unique=True)),
(
"type",
models.CharField(
choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jina", "Jina")],
default="jina",
max_length=20,
),
),
("api_key", models.CharField(blank=True, default=None, max_length=200, null=True)),
("api_url", models.URLField(blank=True, default=None, null=True)),
],
options={
"abstract": False,
},
),
migrations.AddField(
model_name="serverchatsettings",
name="web_scraper",
field=models.ForeignKey(
blank=True,
default=None,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="web_scraper",
to="database.webscraper",
),
),
]
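
After this migration, server chat settings reference a WebScraper row instead
of storing a choice string, so the pinned scraper carries its own credentials
and endpoint. A hypothetical query through the new relation (assuming khoj's
Django settings are loaded):

from khoj.database.models import ServerChatSettings

# Resolve the pinned scraper through the new foreign key.
settings = ServerChatSettings.objects.prefetch_related("web_scraper").first()
if settings and settings.web_scraper:
    print(settings.web_scraper.type, settings.web_scraper.api_url)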

View file

@@ -1,3 +1,4 @@
import os
import re
import uuid
from random import choice
@@ -12,8 +13,6 @@ from django.utils.translation import gettext_lazy
from pgvector.django import VectorField
from phonenumber_field.modelfields import PhoneNumberField
from khoj.utils.helpers import ConversationCommand
class BaseModel(models.Model):
created_at = models.DateTimeField(auto_now_add=True)
@@ -245,19 +244,58 @@ class GithubRepoConfig(BaseModel):
github_config = models.ForeignKey(GithubConfig, on_delete=models.CASCADE, related_name="githubrepoconfig")
class ServerChatSettings(BaseModel):
class WebScraper(models.TextChoices):
class WebScraper(BaseModel):
class WebScraperType(models.TextChoices):
FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl")
OLOSTEP = "olostep", gettext_lazy("Olostep")
JINAAI = "jinaai", gettext_lazy("JinaAI")
JINA = "jina", gettext_lazy("Jina")
name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True)
type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA)
api_key = models.CharField(max_length=200, default=None, null=True, blank=True)
api_url = models.URLField(max_length=200, default=None, null=True, blank=True)
def clean(self):
error = {}
if self.name is None:
self.name = self.type.capitalize()
if self.api_url is None:
if self.type == self.WebScraperType.FIRECRAWL:
self.api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
elif self.type == self.WebScraperType.OLOSTEP:
self.api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
elif self.type == self.WebScraperType.JINA:
self.api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
if self.api_key is None:
if self.type == self.WebScraperType.FIRECRAWL:
self.api_key = os.getenv("FIRECRAWL_API_KEY")
if not self.api_key and self.api_url == "https://api.firecrawl.dev":
error["api_key"] = "Set API key to use default Firecrawl. Get API key from https://firecrawl.dev."
elif self.type == self.WebScraperType.OLOSTEP:
self.api_key = os.getenv("OLOSTEP_API_KEY")
if self.api_key is None:
error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/."
elif self.type == self.WebScraperType.JINA:
self.api_key = os.getenv("JINA_API_KEY")
if error:
raise ValidationError(error)
def save(self, *args, **kwargs):
self.clean()
super().save(*args, **kwargs)
class ServerChatSettings(BaseModel):
chat_default = models.ForeignKey(
ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_default"
)
chat_advanced = models.ForeignKey(
ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_advanced"
)
web_scraper = models.CharField(max_length=20, choices=WebScraper.choices, default=WebScraper.JINAAI)
web_scraper = models.ForeignKey(
WebScraper, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="web_scraper"
)
class LocalOrgConfig(BaseModel):
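
The clean method above backfills the scraper's name, API URL, and API key from
environment variables when left blank, and rejects entries that would be
unusable. A sketch of creating a scraper from the Django shell (assuming
khoj's Django settings are loaded; the key value is a placeholder):

from khoj.database.models import WebScraper

# save() calls clean(): it fills name="Olostep" and the default API URL, and
# would raise ValidationError if no Olostep API key were set here or in the env.
scraper = WebScraper(type=WebScraper.WebScraperType.OLOSTEP, api_key="<your-olostep-api-key>")
scraper.save()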

View file

@@ -11,7 +11,7 @@ from bs4 import BeautifulSoup
from markdownify import markdownify
from khoj.database.adapters import ConversationAdapters
from khoj.database.models import Agent, KhojUser, ServerChatSettings
from khoj.database.models import Agent, KhojUser, WebScraper
from khoj.processor.conversation import prompts
from khoj.routers.helpers import (
ChatEvent,
@@ -27,16 +27,11 @@ logger = logging.getLogger(__name__)
SERPER_DEV_API_KEY = os.getenv("SERPER_DEV_API_KEY")
SERPER_DEV_URL = "https://google.serper.dev/search"
JINA_READER_API_URL = "https://r.jina.ai/"
JINA_SEARCH_API_URL = "https://s.jina.ai/"
JINA_API_KEY = os.getenv("JINA_API_KEY")
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
FIRECRAWL_TO_EXTRACT = os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true"
OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY")
OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI"
OLOSTEP_QUERY_PARAMS = {
"timeout": 35, # seconds
"waitBeforeScraping": 1, # seconds
@@ -175,29 +170,47 @@ async def read_webpages(
yield response
async def read_webpage(
url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
) -> Tuple[str | None, str | None]:
if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_TO_EXTRACT:
return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
elif scraper_type == WebScraper.WebScraperType.FIRECRAWL:
return await read_webpage_with_firecrawl(url, api_key, api_url), None
elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
return await read_webpage_with_olostep(url, api_key, api_url), None
else:
return await read_webpage_with_jina(url, api_key, api_url), None
async def read_webpage_and_extract_content(
subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None
) -> Tuple[set[str], str, Union[None, str]]:
# Select the web scraper to use for reading the web page
web_scraper = await ConversationAdapters.aget_webscraper(FIRECRAWL_API_KEY, OLOSTEP_API_KEY)
# Select the web scrapers to use for reading the web page
web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()
# Fallback through enabled web scrapers until we successfully read the web page
extracted_info = None
for scraper_type, api_key, api_url, api_name in web_scrapers:
try:
# Read the web page
if is_none_or_empty(content):
with timer(f"Reading web page with {web_scraper.value} at '{url}' took", logger, log_level=logging.INFO):
if web_scraper == ServerChatSettings.WebScraper.FIRECRAWL:
if FIRECRAWL_TO_EXTRACT:
extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subqueries, agent)
else:
content = await read_webpage_with_firecrawl(url)
elif web_scraper == ServerChatSettings.WebScraper.OLOSTEP:
content = await read_webpage_with_olostep(url)
else:
content = await read_webpage_with_jina(url)
with timer(f"Reading web page with {scraper_type} at '{url}' took", logger, log_level=logging.INFO):
content, extracted_info = await read_webpage(url, scraper_type, api_key, api_url, subqueries, agent)
# Extract relevant information from the web page
if is_none_or_empty(extracted_info):
with timer(f"Extracting relevant information from web page at '{url}' took", logger):
extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
# If we successfully extracted information, break the loop
if not is_none_or_empty(extracted_info):
break
except Exception as e:
logger.error(f"Failed to read web page with {web_scraper.value} at '{url}' with {e}")
logger.warning(f"Failed to read web page with {scraper_type} at '{url}' with {e}")
# If this is the last web scraper in the list, log an error
if api_name == web_scrapers[-1][-1]:
logger.error(f"All web scrapers failed for '{url}'")
return subqueries, url, extracted_info
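
With the readers parameterized by API key and URL, one-off calls become
straightforward. A hypothetical direct call through the new read_webpage
dispatcher (assuming the module lives at khoj.processor.tools.online_search):

import asyncio

from khoj.database.models import WebScraper
from khoj.processor.tools.online_search import read_webpage

# Read a page via the Jina reader, which needs no API key.
content, extracted = asyncio.run(
    read_webpage("https://example.com", WebScraper.WebScraperType.JINA, None, "https://r.jina.ai/")
)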
@@ -216,23 +229,23 @@ async def read_webpage_at_url(web_url: str) -> str:
return markdownify(body)
async def read_webpage_with_olostep(web_url: str) -> str:
headers = {"Authorization": f"Bearer {OLOSTEP_API_KEY}"}
async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> str:
headers = {"Authorization": f"Bearer {api_key}"}
web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy() # type: ignore
web_scraping_params["url"] = web_url
async with aiohttp.ClientSession() as session:
async with session.get(OLOSTEP_API_URL, params=web_scraping_params, headers=headers) as response:
async with session.get(api_url, params=web_scraping_params, headers=headers) as response:
response.raise_for_status()
response_json = await response.json()
return response_json["markdown_content"]
async def read_webpage_with_jina(web_url: str) -> str:
jina_reader_api_url = f"{JINA_READER_API_URL}/{web_url}"
async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> str:
jina_reader_api_url = f"{api_url}/{web_url}"
headers = {"Accept": "application/json", "X-Timeout": "30"}
if JINA_API_KEY:
headers["Authorization"] = f"Bearer {JINA_API_KEY}"
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
async with aiohttp.ClientSession() as session:
async with session.get(jina_reader_api_url, headers=headers) as response:
@@ -241,9 +254,9 @@ async def read_webpage_with_jina(web_url: str) -> str:
return response_json["data"]["content"]
async def read_webpage_with_firecrawl(web_url: str) -> str:
firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape"
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
firecrawl_api_url = f"{api_url}/v1/scrape"
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]}
async with aiohttp.ClientSession() as session:
@@ -253,9 +266,11 @@ async def read_webpage_with_firecrawl(web_url: str) -> str:
return response_json["data"]["markdown"]
async def read_webpage_and_extract_content_with_firecrawl(web_url: str, queries: set[str], agent: Agent = None) -> str:
firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape"
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
async def query_webpage_with_firecrawl(
web_url: str, queries: set[str], api_key: str, api_url: str, agent: Agent = None
) -> str:
firecrawl_api_url = f"{api_url}/v1/scrape"
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
schema = {
"type": "object",
"properties": {