Mirror of https://github.com/khoj-ai/khoj.git, synced 2024-11-23 23:48:56 +01:00
Fallback through enabled scrapers to reduce web page read failures
- Set up scrapers via API keys, explicitly adding them via the admin panel, or enabling only a single scraper to use via server chat settings.
- Use validation to ensure only valid scrapers are added via the admin panel, e.g. an API key is present for scrapers that require one.
- Modularize the read webpage functions to take api key and url as args. Removes dependence on constants loaded in online_search. Functions are now mostly self contained.
- Improve ability to read webpages by using the speed and success rate of different scrapers. Optimal configuration needs to be discovered.
parent 11c64791aa
commit d94abba2dc
6 changed files with 196 additions and 76 deletions
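For orientation, here is a minimal sketch of the fallback flow this change introduces. The helper names (get_enabled_scrapers, read_with_scraper, read_webpage_with_fallback) are hypothetical stand-ins for the real functions in the diff below (ConversationAdapters.aget_enabled_webscrapers, read_webpage, read_webpage_and_extract_content); the environment variable names and default URLs are the ones used in the diff.

import logging
import os

logger = logging.getLogger(__name__)

def get_enabled_scrapers():
    # Hypothetical stand-in for ConversationAdapters.aget_enabled_webscrapers:
    # collect (type, api_key, api_url) tuples for scrapers configured via
    # environment variables, with keyless Jina always appended as the last resort.
    scrapers = []
    if os.getenv("FIRECRAWL_API_KEY"):
        scrapers.append(("firecrawl", os.getenv("FIRECRAWL_API_KEY"), os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")))
    if os.getenv("OLOSTEP_API_KEY"):
        scrapers.append(("olostep", os.getenv("OLOSTEP_API_KEY"), os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")))
    scrapers.append(("jina", os.getenv("JINA_API_KEY"), os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")))
    return scrapers

def read_with_scraper(scraper_type, api_key, api_url, url):
    # Hypothetical per-scraper reader; in the diff, read_webpage() dispatches to
    # read_webpage_with_firecrawl / read_webpage_with_olostep / read_webpage_with_jina.
    raise NotImplementedError

def read_webpage_with_fallback(url):
    # Try each enabled scraper in order; return the first successful result.
    for scraper_type, api_key, api_url in get_enabled_scrapers():
        try:
            content = read_with_scraper(scraper_type, api_key, api_url, url)
            if content:
                return content
        except Exception as e:
            logger.warning(f"Failed to read {url} with {scraper_type}: {e}")
    logger.error(f"All web scrapers failed for '{url}'")
    return None

In the actual commit the scraper list can also come from WebScraper rows added in the admin panel, or be pinned to a single scraper via server chat settings, and each attempt additionally extracts relevant information from the page before breaking out of the loop.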
@@ -1,6 +1,7 @@
import json
import logging
import math
import os
import random
import re
import secrets
@@ -10,7 +11,6 @@ from enum import Enum
from typing import Callable, Iterable, List, Optional, Type

import cron_descriptor
import django
from apscheduler.job import Job
from asgiref.sync import sync_to_async
from django.contrib.sessions.backends.db import SessionStore
@@ -52,6 +52,7 @@ from khoj.database.models import (
    UserTextToImageModelConfig,
    UserVoiceModelConfig,
    VoiceModelOption,
    WebScraper,
)
from khoj.processor.conversation import prompts
from khoj.search_filter.date_filter import DateFilter
@@ -1032,17 +1033,43 @@ class ConversationAdapters:
        return await ConversationAdapters.aget_default_conversation_config(user)

    @staticmethod
    async def aget_webscraper(FIRECRAWL_API_KEY: str = None, OLOSTEP_API_KEY: str = None):
        server_chat_settings: ServerChatSettings = await ServerChatSettings.objects.filter().afirst()
    async def aget_server_webscraper():
        server_chat_settings = await ServerChatSettings.objects.filter().prefetch_related("web_scraper").afirst()
        if server_chat_settings is not None and server_chat_settings.web_scraper is not None:
            web_scraper = ServerChatSettings.WebScraper(server_chat_settings.web_scraper)
            if (web_scraper == ServerChatSettings.WebScraper.FIRECRAWL and FIRECRAWL_API_KEY) or (
                web_scraper == ServerChatSettings.WebScraper.OLOSTEP and OLOSTEP_API_KEY
            ):
                return web_scraper
        # Fallback to JinaAI if the API keys for the other providers are not set
        # JinaAI is the default web scraper as it does not require an API key
        return ServerChatSettings.WebScraper.JINAAI
            return server_chat_settings.web_scraper
        return None

    @staticmethod
    async def aget_enabled_webscrapers():
        enabled_scrapers = []
        server_webscraper = await ConversationAdapters.aget_server_webscraper()
        if server_webscraper:
            # Only use the webscraper set in the server chat settings
            enabled_scrapers = [
                (server_webscraper.type, server_webscraper.api_key, server_webscraper.api_url, server_webscraper.name)
            ]
        if not enabled_scrapers:
            # Use the enabled web scrapers, newest created scraper first, until web page content is retrieved
            enabled_scrapers = [
                (scraper.type, scraper.api_key, scraper.api_url, scraper.name)
                async for scraper in WebScraper.objects.all().order_by("-created_at").aiterator()
            ]
        if not enabled_scrapers:
            # Use scrapers enabled via environment variables
            if os.getenv("FIRECRAWL_API_KEY"):
                api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
                enabled_scrapers.append(
                    (WebScraper.WebScraperType.FIRECRAWL, os.getenv("FIRECRAWL_API_KEY"), api_url, "Firecrawl")
                )
            if os.getenv("OLOSTEP_API_KEY"):
                api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
                enabled_scrapers.append(
                    (WebScraper.WebScraperType.OLOSTEP, os.getenv("OLOSTEP_API_KEY"), api_url, "Olostep")
                )
            # Jina is the default fallback scraper to use as it does not require an API key
            api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
            enabled_scrapers.append((WebScraper.WebScraperType.JINA, os.getenv("JINA_API_KEY"), api_url, "Jina"))
        return enabled_scrapers

    @staticmethod
    def create_conversation_from_public_conversation(
@@ -31,6 +31,7 @@ from khoj.database.models import (
    UserSearchModelConfig,
    UserVoiceModelConfig,
    VoiceModelOption,
    WebScraper,
)
from khoj.utils.helpers import ImageIntentType

@@ -202,6 +203,19 @@ class ServerChatSettingsAdmin(admin.ModelAdmin):
    )


@admin.register(WebScraper)
class WebScraperAdmin(admin.ModelAdmin):
    list_display = (
        "name",
        "type",
        "api_key",
        "api_url",
        "created_at",
    )
    search_fields = ("name", "api_key", "api_url", "type")
    ordering = ("-created_at",)


@admin.register(Conversation)
class ConversationAdmin(admin.ModelAdmin):
    list_display = (
@@ -1,21 +0,0 @@
# Generated by Django 5.0.8 on 2024-10-16 00:06

from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        ("database", "0067_alter_agent_style_icon"),
    ]

    operations = [
        migrations.AddField(
            model_name="serverchatsettings",
            name="web_scraper",
            field=models.CharField(
                choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jinaai", "JinaAI")],
                default="jinaai",
                max_length=20,
            ),
        ),
    ]
@@ -0,0 +1,47 @@
# Generated by Django 5.0.8 on 2024-10-16 06:51

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        ("database", "0067_alter_agent_style_icon"),
    ]

    operations = [
        migrations.CreateModel(
            name="WebScraper",
            fields=[
                ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
                ("name", models.CharField(blank=True, default=None, max_length=200, null=True, unique=True)),
                (
                    "type",
                    models.CharField(
                        choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jina", "Jina")],
                        default="jina",
                        max_length=20,
                    ),
                ),
                ("api_key", models.CharField(blank=True, default=None, max_length=200, null=True)),
                ("api_url", models.URLField(blank=True, default=None, null=True)),
            ],
            options={
                "abstract": False,
            },
        ),
        migrations.AddField(
            model_name="serverchatsettings",
            name="web_scraper",
            field=models.ForeignKey(
                blank=True,
                default=None,
                null=True,
                on_delete=django.db.models.deletion.CASCADE,
                related_name="web_scraper",
                to="database.webscraper",
            ),
        ),
    ]
@@ -1,3 +1,4 @@
import os
import re
import uuid
from random import choice
@@ -12,8 +13,6 @@ from django.utils.translation import gettext_lazy
from pgvector.django import VectorField
from phonenumber_field.modelfields import PhoneNumberField

from khoj.utils.helpers import ConversationCommand


class BaseModel(models.Model):
    created_at = models.DateTimeField(auto_now_add=True)
@@ -245,19 +244,58 @@ class GithubRepoConfig(BaseModel):
    github_config = models.ForeignKey(GithubConfig, on_delete=models.CASCADE, related_name="githubrepoconfig")


class ServerChatSettings(BaseModel):
    class WebScraper(models.TextChoices):
class WebScraper(BaseModel):
    class WebScraperType(models.TextChoices):
        FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl")
        OLOSTEP = "olostep", gettext_lazy("Olostep")
        JINAAI = "jinaai", gettext_lazy("JinaAI")
        JINA = "jina", gettext_lazy("Jina")

    name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True)
    type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA)
    api_key = models.CharField(max_length=200, default=None, null=True, blank=True)
    api_url = models.URLField(max_length=200, default=None, null=True, blank=True)

    def clean(self):
        error = {}
        if self.name is None:
            self.name = self.type.capitalize()
        if self.api_url is None:
            if self.type == self.WebScraperType.FIRECRAWL:
                self.api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
            elif self.type == self.WebScraperType.OLOSTEP:
                self.api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
            elif self.type == self.WebScraperType.JINA:
                self.api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
        if self.api_key is None:
            if self.type == self.WebScraperType.FIRECRAWL:
                self.api_key = os.getenv("FIRECRAWL_API_KEY")
                if not self.api_key and self.api_url == "https://api.firecrawl.dev":
                    error["api_key"] = "Set API key to use default Firecrawl. Get API key from https://firecrawl.dev."
            elif self.type == self.WebScraperType.OLOSTEP:
                self.api_key = os.getenv("OLOSTEP_API_KEY")
                if self.api_key is None:
                    error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/."
            elif self.type == self.WebScraperType.JINA:
                self.api_key = os.getenv("JINA_API_KEY")

        if error:
            raise ValidationError(error)

    def save(self, *args, **kwargs):
        self.clean()
        super().save(*args, **kwargs)


class ServerChatSettings(BaseModel):
    chat_default = models.ForeignKey(
        ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_default"
    )
    chat_advanced = models.ForeignKey(
        ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_advanced"
    )
    web_scraper = models.CharField(max_length=20, choices=WebScraper.choices, default=WebScraper.JINAAI)
    web_scraper = models.ForeignKey(
        WebScraper, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="web_scraper"
    )


class LocalOrgConfig(BaseModel):
@@ -11,7 +11,7 @@ from bs4 import BeautifulSoup
from markdownify import markdownify

from khoj.database.adapters import ConversationAdapters
from khoj.database.models import Agent, KhojUser, ServerChatSettings
from khoj.database.models import Agent, KhojUser, WebScraper
from khoj.processor.conversation import prompts
from khoj.routers.helpers import (
    ChatEvent,
@@ -27,16 +27,11 @@ logger = logging.getLogger(__name__)
SERPER_DEV_API_KEY = os.getenv("SERPER_DEV_API_KEY")
SERPER_DEV_URL = "https://google.serper.dev/search"

JINA_READER_API_URL = "https://r.jina.ai/"
JINA_SEARCH_API_URL = "https://s.jina.ai/"
JINA_API_KEY = os.getenv("JINA_API_KEY")

FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
FIRECRAWL_TO_EXTRACT = os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true"

OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY")
OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI"
OLOSTEP_QUERY_PARAMS = {
    "timeout": 35,  # seconds
    "waitBeforeScraping": 1,  # seconds
@@ -175,29 +170,47 @@ async def read_webpages(
        yield response


async def read_webpage(
    url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
) -> Tuple[str | None, str | None]:
    if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_TO_EXTRACT:
        return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
    elif scraper_type == WebScraper.WebScraperType.FIRECRAWL:
        return await read_webpage_with_firecrawl(url, api_key, api_url), None
    elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
        return await read_webpage_with_olostep(url, api_key, api_url), None
    else:
        return await read_webpage_with_jina(url, api_key, api_url), None


async def read_webpage_and_extract_content(
    subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None
) -> Tuple[set[str], str, Union[None, str]]:
    # Select the web scraper to use for reading the web page
    web_scraper = await ConversationAdapters.aget_webscraper(FIRECRAWL_API_KEY, OLOSTEP_API_KEY)
    # Select the web scrapers to use for reading the web page
    web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()

    # Fallback through enabled web scrapers until we successfully read the web page
    extracted_info = None
    for scraper_type, api_key, api_url, api_name in web_scrapers:
        try:
            # Read the web page
            if is_none_or_empty(content):
                with timer(f"Reading web page with {web_scraper.value} at '{url}' took", logger, log_level=logging.INFO):
                    if web_scraper == ServerChatSettings.WebScraper.FIRECRAWL:
                        if FIRECRAWL_TO_EXTRACT:
                            extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subqueries, agent)
                        else:
                            content = await read_webpage_with_firecrawl(url)
                    elif web_scraper == ServerChatSettings.WebScraper.OLOSTEP:
                        content = await read_webpage_with_olostep(url)
                    else:
                        content = await read_webpage_with_jina(url)
                with timer(f"Reading web page with {scraper_type} at '{url}' took", logger, log_level=logging.INFO):
                    content, extracted_info = await read_webpage(url, scraper_type, api_key, api_url, subqueries, agent)

            # Extract relevant information from the web page
            if is_none_or_empty(extracted_info):
                with timer(f"Extracting relevant information from web page at '{url}' took", logger):
                    extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)

            # If we successfully extracted information, break the loop
            if not is_none_or_empty(extracted_info):
                break
        except Exception as e:
            logger.error(f"Failed to read web page with {web_scraper.value} at '{url}' with {e}")
            logger.warning(f"Failed to read web page with {scraper_type} at '{url}' with {e}")
            # If this is the last web scraper in the list, log an error
            if api_name == web_scrapers[-1][-1]:
                logger.error(f"All web scrapers failed for '{url}'")

    return subqueries, url, extracted_info
@@ -216,23 +229,23 @@ async def read_webpage_at_url(web_url: str) -> str:
        return markdownify(body)


async def read_webpage_with_olostep(web_url: str) -> str:
    headers = {"Authorization": f"Bearer {OLOSTEP_API_KEY}"}
async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> str:
    headers = {"Authorization": f"Bearer {api_key}"}
    web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy()  # type: ignore
    web_scraping_params["url"] = web_url

    async with aiohttp.ClientSession() as session:
        async with session.get(OLOSTEP_API_URL, params=web_scraping_params, headers=headers) as response:
        async with session.get(api_url, params=web_scraping_params, headers=headers) as response:
            response.raise_for_status()
            response_json = await response.json()
            return response_json["markdown_content"]


async def read_webpage_with_jina(web_url: str) -> str:
    jina_reader_api_url = f"{JINA_READER_API_URL}/{web_url}"
async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> str:
    jina_reader_api_url = f"{api_url}/{web_url}"
    headers = {"Accept": "application/json", "X-Timeout": "30"}
    if JINA_API_KEY:
        headers["Authorization"] = f"Bearer {JINA_API_KEY}"
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    async with aiohttp.ClientSession() as session:
        async with session.get(jina_reader_api_url, headers=headers) as response:
@@ -241,9 +254,9 @@ async def read_webpage_with_jina(web_url: str) -> str:
            return response_json["data"]["content"]


async def read_webpage_with_firecrawl(web_url: str) -> str:
    firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape"
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
    firecrawl_api_url = f"{api_url}/v1/scrape"
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
    params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]}

    async with aiohttp.ClientSession() as session:
@@ -253,9 +266,11 @@ async def read_webpage_with_firecrawl(web_url: str) -> str:
            return response_json["data"]["markdown"]


async def read_webpage_and_extract_content_with_firecrawl(web_url: str, queries: set[str], agent: Agent = None) -> str:
    firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape"
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
async def query_webpage_with_firecrawl(
    web_url: str, queries: set[str], api_key: str, api_url: str, agent: Agent = None
) -> str:
    firecrawl_api_url = f"{api_url}/v1/scrape"
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
    schema = {
        "type": "object",
        "properties": {