Fall back through enabled scrapers to reduce web page read failures

- Set up scrapers via API keys, either by explicitly adding them via
  the admin panel or by enabling a single scraper to use via server
  chat settings.

- Validate scrapers when they are added via the admin panel, ensuring
  e.g. that an API key is present for scrapers that require one.

- Modularize the read webpage functions to take the API key and URL as
  arguments. This removes their dependence on constants loaded in
  online_search; the functions are now mostly self-contained.

- Improve the ability to read web pages by exploiting the differing
  speed and success rate of the enabled scrapers, falling back through
  them until one succeeds (see the sketch after this list). The optimal
  configuration still needs to be discovered.
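
For illustration, a minimal sketch of the fallback flow, using the
function names and (type, api_key, api_url, name) tuple shape from the
diff below; the url variable and the success check are illustrative:

    # Try each enabled scraper in priority order until one succeeds
    web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()
    extracted_info = None
    for scraper_type, api_key, api_url, name in web_scrapers:
        try:
            # Each reader is self-contained: credentials are passed in explicitly
            content, extracted_info = await read_webpage(url, scraper_type, api_key, api_url)
            if content or extracted_info:
                break  # this scraper succeeded; stop falling back
        except Exception:
            continue  # this scraper failed; try the next enabled one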
Debanjum Singh Solanky 2024-10-16 00:37:46 -07:00
parent 11c64791aa
commit d94abba2dc
6 changed files with 196 additions and 76 deletions


@@ -1,6 +1,7 @@
 import json
 import logging
 import math
+import os
 import random
 import re
 import secrets
@@ -10,7 +11,6 @@ from enum import Enum
 from typing import Callable, Iterable, List, Optional, Type

 import cron_descriptor
-import django
 from apscheduler.job import Job
 from asgiref.sync import sync_to_async
 from django.contrib.sessions.backends.db import SessionStore
@@ -52,6 +52,7 @@ from khoj.database.models import (
     UserTextToImageModelConfig,
     UserVoiceModelConfig,
     VoiceModelOption,
+    WebScraper,
 )
 from khoj.processor.conversation import prompts
 from khoj.search_filter.date_filter import DateFilter
@@ -1032,17 +1033,43 @@ class ConversationAdapters:
         return await ConversationAdapters.aget_default_conversation_config(user)

     @staticmethod
-    async def aget_webscraper(FIRECRAWL_API_KEY: str = None, OLOSTEP_API_KEY: str = None):
-        server_chat_settings: ServerChatSettings = await ServerChatSettings.objects.filter().afirst()
-        if server_chat_settings is not None and server_chat_settings.web_scraper is not None:
-            web_scraper = ServerChatSettings.WebScraper(server_chat_settings.web_scraper)
-            if (web_scraper == ServerChatSettings.WebScraper.FIRECRAWL and FIRECRAWL_API_KEY) or (
-                web_scraper == ServerChatSettings.WebScraper.OLOSTEP and OLOSTEP_API_KEY
-            ):
-                return web_scraper
-        # Fallback to JinaAI if the API keys for the other providers are not set
-        # JinaAI is the default web scraper as it does not require an API key
-        return ServerChatSettings.WebScraper.JINAAI
+    async def aget_server_webscraper():
+        server_chat_settings = await ServerChatSettings.objects.filter().prefetch_related("web_scraper").afirst()
+        if server_chat_settings is not None and server_chat_settings.web_scraper is not None:
+            return server_chat_settings.web_scraper
+        return None
+
+    @staticmethod
+    async def aget_enabled_webscrapers():
+        enabled_scrapers = []
+        server_webscraper = await ConversationAdapters.aget_server_webscraper()
+        if server_webscraper:
+            # Only use the web scraper set in the server chat settings
+            enabled_scrapers = [
+                (server_webscraper.type, server_webscraper.api_key, server_webscraper.api_url, server_webscraper.name)
+            ]
+        if not enabled_scrapers:
+            # Use the enabled web scrapers, newest created scraper first, until web page content is retrieved
+            enabled_scrapers = [
+                (scraper.type, scraper.api_key, scraper.api_url, scraper.name)
+                async for scraper in WebScraper.objects.all().order_by("-created_at").aiterator()
+            ]
+        if not enabled_scrapers:
+            # Use scrapers enabled via environment variables
+            if os.getenv("FIRECRAWL_API_KEY"):
+                api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
+                enabled_scrapers.append(
+                    (WebScraper.WebScraperType.FIRECRAWL, os.getenv("FIRECRAWL_API_KEY"), api_url, "Firecrawl")
+                )
+            if os.getenv("OLOSTEP_API_KEY"):
+                api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
+                enabled_scrapers.append(
+                    (WebScraper.WebScraperType.OLOSTEP, os.getenv("OLOSTEP_API_KEY"), api_url, "Olostep")
+                )
+            # Jina is the default fallback scraper as it does not require an API key
+            api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
+            enabled_scrapers.append((WebScraper.WebScraperType.JINA, os.getenv("JINA_API_KEY"), api_url, "Jina"))
+        return enabled_scrapers

     @staticmethod
     def create_conversation_from_public_conversation(
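
As a usage note, when no scraper is pinned in server chat settings and
no WebScraper rows exist in the database, the method above falls
through to environment variables; a rough sketch of the expected
result, with a placeholder key value:

    # Assuming only FIRECRAWL_API_KEY is set in the environment:
    scrapers = await ConversationAdapters.aget_enabled_webscrapers()
    # -> [(WebScraperType.FIRECRAWL, "<firecrawl-key>", "https://api.firecrawl.dev", "Firecrawl"),
    #     (WebScraperType.JINA, None, "https://r.jina.ai/", "Jina")]  # Jina appended last; needs no key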


@@ -31,6 +31,7 @@ from khoj.database.models import (
     UserSearchModelConfig,
     UserVoiceModelConfig,
     VoiceModelOption,
+    WebScraper,
 )

 from khoj.utils.helpers import ImageIntentType
@@ -202,6 +203,19 @@ class ServerChatSettingsAdmin(admin.ModelAdmin):
     )


+@admin.register(WebScraper)
+class WebScraperAdmin(admin.ModelAdmin):
+    list_display = (
+        "name",
+        "type",
+        "api_key",
+        "api_url",
+        "created_at",
+    )
+    search_fields = ("name", "api_key", "api_url", "type")
+    ordering = ("-created_at",)
+
+
 @admin.register(Conversation)
 class ConversationAdmin(admin.ModelAdmin):
     list_display = (


@@ -1,21 +0,0 @@
-# Generated by Django 5.0.8 on 2024-10-16 00:06
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-    dependencies = [
-        ("database", "0067_alter_agent_style_icon"),
-    ]
-
-    operations = [
-        migrations.AddField(
-            model_name="serverchatsettings",
-            name="web_scraper",
-            field=models.CharField(
-                choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jinaai", "JinaAI")],
-                default="jinaai",
-                max_length=20,
-            ),
-        ),
-    ]


@@ -0,0 +1,47 @@
+# Generated by Django 5.0.8 on 2024-10-16 06:51
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("database", "0067_alter_agent_style_icon"),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name="WebScraper",
+            fields=[
+                ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
+                ("created_at", models.DateTimeField(auto_now_add=True)),
+                ("updated_at", models.DateTimeField(auto_now=True)),
+                ("name", models.CharField(blank=True, default=None, max_length=200, null=True, unique=True)),
+                (
+                    "type",
+                    models.CharField(
+                        choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jina", "Jina")],
+                        default="jina",
+                        max_length=20,
+                    ),
+                ),
+                ("api_key", models.CharField(blank=True, default=None, max_length=200, null=True)),
+                ("api_url", models.URLField(blank=True, default=None, null=True)),
+            ],
+            options={
+                "abstract": False,
+            },
+        ),
+        migrations.AddField(
+            model_name="serverchatsettings",
+            name="web_scraper",
+            field=models.ForeignKey(
+                blank=True,
+                default=None,
+                null=True,
+                on_delete=django.db.models.deletion.CASCADE,
+                related_name="web_scraper",
+                to="database.webscraper",
+            ),
+        ),
+    ]


@@ -1,3 +1,4 @@
+import os
 import re
 import uuid
 from random import choice
@@ -12,8 +13,6 @@ from django.utils.translation import gettext_lazy
 from pgvector.django import VectorField
 from phonenumber_field.modelfields import PhoneNumberField

-from khoj.utils.helpers import ConversationCommand
-

 class BaseModel(models.Model):
     created_at = models.DateTimeField(auto_now_add=True)
@@ -245,19 +244,58 @@ class GithubRepoConfig(BaseModel):
     github_config = models.ForeignKey(GithubConfig, on_delete=models.CASCADE, related_name="githubrepoconfig")


-class ServerChatSettings(BaseModel):
-    class WebScraper(models.TextChoices):
+class WebScraper(BaseModel):
+    class WebScraperType(models.TextChoices):
         FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl")
         OLOSTEP = "olostep", gettext_lazy("Olostep")
-        JINAAI = "jinaai", gettext_lazy("JinaAI")
+        JINA = "jina", gettext_lazy("Jina")
+
+    name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True)
+    type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA)
+    api_key = models.CharField(max_length=200, default=None, null=True, blank=True)
+    api_url = models.URLField(max_length=200, default=None, null=True, blank=True)
+
+    def clean(self):
+        error = {}
+        if self.name is None:
+            self.name = self.type.capitalize()
+        if self.api_url is None:
+            if self.type == self.WebScraperType.FIRECRAWL:
+                self.api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
+            elif self.type == self.WebScraperType.OLOSTEP:
+                self.api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
+            elif self.type == self.WebScraperType.JINA:
+                self.api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
+        if self.api_key is None:
+            if self.type == self.WebScraperType.FIRECRAWL:
+                self.api_key = os.getenv("FIRECRAWL_API_KEY")
+                if not self.api_key and self.api_url == "https://api.firecrawl.dev":
+                    error["api_key"] = "Set API key to use default Firecrawl. Get API key from https://firecrawl.dev."
+            elif self.type == self.WebScraperType.OLOSTEP:
+                self.api_key = os.getenv("OLOSTEP_API_KEY")
+                if self.api_key is None:
+                    error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/."
+            elif self.type == self.WebScraperType.JINA:
+                self.api_key = os.getenv("JINA_API_KEY")
+        if error:
+            raise ValidationError(error)
+
+    def save(self, *args, **kwargs):
+        self.clean()
+        super().save(*args, **kwargs)
+
+
+class ServerChatSettings(BaseModel):
     chat_default = models.ForeignKey(
         ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_default"
     )
     chat_advanced = models.ForeignKey(
         ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_advanced"
     )
-    web_scraper = models.CharField(max_length=20, choices=WebScraper.choices, default=WebScraper.JINAAI)
+    web_scraper = models.ForeignKey(
+        WebScraper, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="web_scraper"
+    )


 class LocalOrgConfig(BaseModel):
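
A brief sketch of the model's clean() behavior above, assuming the
scraper API key and URL environment variables are unset:

    # Name and API URL are derived from the scraper type when omitted
    scraper = WebScraper(type=WebScraper.WebScraperType.JINA)
    scraper.save()                      # save() runs clean() before persisting
    assert scraper.name == "Jina"       # from "jina".capitalize()
    assert scraper.api_url == "https://r.jina.ai/"

    # A Firecrawl scraper pointed at the hosted default API needs a key
    WebScraper(type=WebScraper.WebScraperType.FIRECRAWL).save()
    # -> ValidationError({"api_key": "Set API key to use default Firecrawl. ..."})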


@@ -11,7 +11,7 @@ from bs4 import BeautifulSoup
 from markdownify import markdownify

 from khoj.database.adapters import ConversationAdapters
-from khoj.database.models import Agent, KhojUser, ServerChatSettings
+from khoj.database.models import Agent, KhojUser, WebScraper
 from khoj.processor.conversation import prompts
 from khoj.routers.helpers import (
     ChatEvent,
@@ -27,16 +27,11 @@ logger = logging.getLogger(__name__)
 SERPER_DEV_API_KEY = os.getenv("SERPER_DEV_API_KEY")
 SERPER_DEV_URL = "https://google.serper.dev/search"

-JINA_READER_API_URL = "https://r.jina.ai/"
 JINA_SEARCH_API_URL = "https://s.jina.ai/"
 JINA_API_KEY = os.getenv("JINA_API_KEY")

-FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
-FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
 FIRECRAWL_TO_EXTRACT = os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true"

-OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY")
-OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI"
 OLOSTEP_QUERY_PARAMS = {
     "timeout": 35,  # seconds
     "waitBeforeScraping": 1,  # seconds
@@ -175,29 +170,47 @@
         yield response


+async def read_webpage(
+    url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
+) -> Tuple[str | None, str | None]:
+    if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_TO_EXTRACT:
+        return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
+    elif scraper_type == WebScraper.WebScraperType.FIRECRAWL:
+        return await read_webpage_with_firecrawl(url, api_key, api_url), None
+    elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
+        return await read_webpage_with_olostep(url, api_key, api_url), None
+    else:
+        return await read_webpage_with_jina(url, api_key, api_url), None
+
+
 async def read_webpage_and_extract_content(
     subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None
 ) -> Tuple[set[str], str, Union[None, str]]:
-    # Select the web scraper to use for reading the web page
-    web_scraper = await ConversationAdapters.aget_webscraper(FIRECRAWL_API_KEY, OLOSTEP_API_KEY)
+    # Select the web scrapers to use for reading the web page
+    web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()

+    # Fall back through the enabled web scrapers until the web page is read successfully
     extracted_info = None
-    try:
-        if is_none_or_empty(content):
-            with timer(f"Reading web page with {web_scraper.value} at '{url}' took", logger, log_level=logging.INFO):
-                if web_scraper == ServerChatSettings.WebScraper.FIRECRAWL:
-                    if FIRECRAWL_TO_EXTRACT:
-                        extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subqueries, agent)
-                    else:
-                        content = await read_webpage_with_firecrawl(url)
-                elif web_scraper == ServerChatSettings.WebScraper.OLOSTEP:
-                    content = await read_webpage_with_olostep(url)
-                else:
-                    content = await read_webpage_with_jina(url)
-        if is_none_or_empty(extracted_info):
-            with timer(f"Extracting relevant information from web page at '{url}' took", logger):
-                extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
-    except Exception as e:
-        logger.error(f"Failed to read web page with {web_scraper.value} at '{url}' with {e}")
+    for scraper_type, api_key, api_url, api_name in web_scrapers:
+        try:
+            # Read the web page
+            if is_none_or_empty(content):
+                with timer(f"Reading web page with {scraper_type} at '{url}' took", logger, log_level=logging.INFO):
+                    content, extracted_info = await read_webpage(url, scraper_type, api_key, api_url, subqueries, agent)
+
+            # Extract relevant information from the web page
+            if is_none_or_empty(extracted_info):
+                with timer(f"Extracting relevant information from web page at '{url}' took", logger):
+                    extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
+
+            # If information was successfully extracted, break out of the fallback loop
+            if not is_none_or_empty(extracted_info):
+                break
+        except Exception as e:
+            logger.warning(f"Failed to read web page with {scraper_type} at '{url}' with {e}")
+            # If this was the last web scraper in the list, log an error
+            if api_name == web_scrapers[-1][-1]:
+                logger.error(f"All web scrapers failed for '{url}'")

     return subqueries, url, extracted_info
@@ -216,23 +229,23 @@ async def read_webpage_at_url(web_url: str) -> str:
     return markdownify(body)


-async def read_webpage_with_olostep(web_url: str) -> str:
-    headers = {"Authorization": f"Bearer {OLOSTEP_API_KEY}"}
+async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> str:
+    headers = {"Authorization": f"Bearer {api_key}"}
     web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy()  # type: ignore
     web_scraping_params["url"] = web_url

     async with aiohttp.ClientSession() as session:
-        async with session.get(OLOSTEP_API_URL, params=web_scraping_params, headers=headers) as response:
+        async with session.get(api_url, params=web_scraping_params, headers=headers) as response:
             response.raise_for_status()
             response_json = await response.json()
             return response_json["markdown_content"]


-async def read_webpage_with_jina(web_url: str) -> str:
-    jina_reader_api_url = f"{JINA_READER_API_URL}/{web_url}"
+async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> str:
+    jina_reader_api_url = f"{api_url}/{web_url}"
     headers = {"Accept": "application/json", "X-Timeout": "30"}
-    if JINA_API_KEY:
-        headers["Authorization"] = f"Bearer {JINA_API_KEY}"
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"

     async with aiohttp.ClientSession() as session:
         async with session.get(jina_reader_api_url, headers=headers) as response:
@@ -241,9 +254,9 @@ async def read_webpage_with_jina(web_url: str) -> str:
             return response_json["data"]["content"]


-async def read_webpage_with_firecrawl(web_url: str) -> str:
-    firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape"
-    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
+async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
+    firecrawl_api_url = f"{api_url}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
     params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]}

     async with aiohttp.ClientSession() as session:
@@ -253,9 +266,11 @@ async def read_webpage_with_firecrawl(web_url: str) -> str:
             return response_json["data"]["markdown"]


-async def read_webpage_and_extract_content_with_firecrawl(web_url: str, queries: set[str], agent: Agent = None) -> str:
-    firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape"
-    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
+async def query_webpage_with_firecrawl(
+    web_url: str, queries: set[str], api_key: str, api_url: str, agent: Agent = None
+) -> str:
+    firecrawl_api_url = f"{api_url}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
     schema = {
         "type": "object",
         "properties": {