Fallback through enabled scrapers to reduce web page read failures
- Set up scrapers via API keys, explicitly adding them via the admin panel or enabling only a single scraper via server chat settings.
- Use validation to ensure only valid scrapers are added via the admin panel, e.g. that an API key is present for scrapers that require it.
- Modularize the read webpage functions to take api key and url as args. This removes their dependence on constants loaded in online_search; the functions are now mostly self contained.
- Improve the ability to read webpages by using the speed and success rate of different scrapers. The optimal configuration still needs to be discovered.
parent 11c64791aa
commit d94abba2dc
6 changed files with 196 additions and 76 deletions
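The gist of the change, as a minimal standalone sketch (hypothetical stub reader functions, not the actual Khoj code): enabled scrapers are tried in priority order and the first successful read wins; only if every scraper fails is an error logged.

import asyncio
import logging

logger = logging.getLogger(__name__)

# Hypothetical stub readers standing in for the Firecrawl/Olostep/Jina calls
async def flaky_reader(url: str) -> str:
    raise RuntimeError("scraper failed")

async def working_reader(url: str) -> str:
    return f"content of {url}"

async def read_with_fallback(url: str) -> str | None:
    # Ordered by priority: server-configured scraper first, then admin-added, then env-configured
    scrapers = [("firecrawl", flaky_reader), ("jina", working_reader)]
    for name, read in scrapers:
        try:
            content = await read(url)
            if content:
                return content
        except Exception as e:
            logger.warning(f"Failed to read {url} with {name}: {e}")
    logger.error(f"All web scrapers failed for '{url}'")
    return None

print(asyncio.run(read_with_fallback("https://example.com")))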
@@ -1,6 +1,7 @@
 import json
 import logging
 import math
+import os
 import random
 import re
 import secrets
@@ -10,7 +11,6 @@ from enum import Enum
 from typing import Callable, Iterable, List, Optional, Type
 
 import cron_descriptor
-import django
 from apscheduler.job import Job
 from asgiref.sync import sync_to_async
 from django.contrib.sessions.backends.db import SessionStore
@@ -52,6 +52,7 @@ from khoj.database.models import (
     UserTextToImageModelConfig,
     UserVoiceModelConfig,
     VoiceModelOption,
+    WebScraper,
 )
 from khoj.processor.conversation import prompts
 from khoj.search_filter.date_filter import DateFilter
@@ -1032,17 +1033,43 @@ class ConversationAdapters:
         return await ConversationAdapters.aget_default_conversation_config(user)
 
     @staticmethod
-    async def aget_webscraper(FIRECRAWL_API_KEY: str = None, OLOSTEP_API_KEY: str = None):
-        server_chat_settings: ServerChatSettings = await ServerChatSettings.objects.filter().afirst()
+    async def aget_server_webscraper():
+        server_chat_settings = await ServerChatSettings.objects.filter().prefetch_related("web_scraper").afirst()
         if server_chat_settings is not None and server_chat_settings.web_scraper is not None:
-            web_scraper = ServerChatSettings.WebScraper(server_chat_settings.web_scraper)
-            if (web_scraper == ServerChatSettings.WebScraper.FIRECRAWL and FIRECRAWL_API_KEY) or (
-                web_scraper == ServerChatSettings.WebScraper.OLOSTEP and OLOSTEP_API_KEY
-            ):
-                return web_scraper
-        # Fallback to JinaAI if the API keys for the other providers are not set
-        # JinaAI is the default web scraper as it does not require an API key
-        return ServerChatSettings.WebScraper.JINAAI
+            return server_chat_settings.web_scraper
+        return None
+
+    @staticmethod
+    async def aget_enabled_webscrapers():
+        enabled_scrapers = []
+        server_webscraper = await ConversationAdapters.aget_server_webscraper()
+        if server_webscraper:
+            # Only use the webscraper set in the server chat settings
+            enabled_scrapers = [
+                (server_webscraper.type, server_webscraper.api_key, server_webscraper.api_url, server_webscraper.name)
+            ]
+        if not enabled_scrapers:
+            # Use the enabled web scrapers, using the newest created scraper first, until get web page content
+            enabled_scrapers = [
+                (scraper.type, scraper.api_key, scraper.api_url, scraper.name)
+                async for scraper in WebScraper.objects.all().order_by("-created_at").aiterator()
+            ]
+        if not enabled_scrapers:
+            # Use scrapers enabled via environment variables
+            if os.getenv("FIRECRAWL_API_KEY"):
+                api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
+                enabled_scrapers.append(
+                    (WebScraper.WebScraperType.FIRECRAWL, os.getenv("FIRECRAWL_API_KEY"), api_url, "Firecrawl")
+                )
+            if os.getenv("OLOSTEP_API_KEY"):
+                api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
+                enabled_scrapers.append(
+                    (WebScraper.WebScraperType.OLOSTEP, os.getenv("OLOSTEP_API_KEY"), api_url, "Olostep")
+                )
+            # Jina is the default fallback scraper to use as it does not require an API key
+            api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
+            enabled_scrapers.append((WebScraper.WebScraperType.JINA, os.getenv("JINA_API_KEY"), api_url, "Jina"))
+        return enabled_scrapers
 
     @staticmethod
     def create_conversation_from_public_conversation(
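For context, aget_enabled_webscrapers() returns (type, api_key, api_url, name) tuples in priority order. A rough sketch of its lowest-priority tier, the environment-variable fallback, using plain strings in place of the Django enum members (an illustration, not the adapter itself):

import os

def scrapers_from_env() -> list[tuple[str, str | None, str, str]]:
    # "firecrawl"/"olostep"/"jina" stand in for the WebScraper.WebScraperType values
    scrapers = []
    if os.getenv("FIRECRAWL_API_KEY"):
        scrapers.append(("firecrawl", os.getenv("FIRECRAWL_API_KEY"),
                         os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev"), "Firecrawl"))
    if os.getenv("OLOSTEP_API_KEY"):
        scrapers.append(("olostep", os.getenv("OLOSTEP_API_KEY"),
                         os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI"), "Olostep"))
    # Jina is always appended last as it works without an API key
    scrapers.append(("jina", os.getenv("JINA_API_KEY"),
                     os.getenv("JINA_READER_API_URL", "https://r.jina.ai/"), "Jina"))
    return scrapers

print(scrapers_from_env())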
@@ -31,6 +31,7 @@ from khoj.database.models import (
     UserSearchModelConfig,
     UserVoiceModelConfig,
     VoiceModelOption,
+    WebScraper,
 )
 from khoj.utils.helpers import ImageIntentType
 
@@ -202,6 +203,19 @@ class ServerChatSettingsAdmin(admin.ModelAdmin):
     )
 
 
+@admin.register(WebScraper)
+class WebScraperAdmin(admin.ModelAdmin):
+    list_display = (
+        "name",
+        "type",
+        "api_key",
+        "api_url",
+        "created_at",
+    )
+    search_fields = ("name", "api_key", "api_url", "type")
+    ordering = ("-created_at",)
+
+
 @admin.register(Conversation)
 class ConversationAdmin(admin.ModelAdmin):
     list_display = (
@@ -1,21 +0,0 @@
-# Generated by Django 5.0.8 on 2024-10-16 00:06
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-    dependencies = [
-        ("database", "0067_alter_agent_style_icon"),
-    ]
-
-    operations = [
-        migrations.AddField(
-            model_name="serverchatsettings",
-            name="web_scraper",
-            field=models.CharField(
-                choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jinaai", "JinaAI")],
-                default="jinaai",
-                max_length=20,
-            ),
-        ),
-    ]
@@ -0,0 +1,47 @@
+# Generated by Django 5.0.8 on 2024-10-16 06:51
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("database", "0067_alter_agent_style_icon"),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name="WebScraper",
+            fields=[
+                ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
+                ("created_at", models.DateTimeField(auto_now_add=True)),
+                ("updated_at", models.DateTimeField(auto_now=True)),
+                ("name", models.CharField(blank=True, default=None, max_length=200, null=True, unique=True)),
+                (
+                    "type",
+                    models.CharField(
+                        choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jina", "Jina")],
+                        default="jina",
+                        max_length=20,
+                    ),
+                ),
+                ("api_key", models.CharField(blank=True, default=None, max_length=200, null=True)),
+                ("api_url", models.URLField(blank=True, default=None, null=True)),
+            ],
+            options={
+                "abstract": False,
+            },
+        ),
+        migrations.AddField(
+            model_name="serverchatsettings",
+            name="web_scraper",
+            field=models.ForeignKey(
+                blank=True,
+                default=None,
+                null=True,
+                on_delete=django.db.models.deletion.CASCADE,
+                related_name="web_scraper",
+                to="database.webscraper",
+            ),
+        ),
+    ]
@@ -1,3 +1,4 @@
+import os
 import re
 import uuid
 from random import choice
@@ -12,8 +13,6 @@ from django.utils.translation import gettext_lazy
 from pgvector.django import VectorField
 from phonenumber_field.modelfields import PhoneNumberField
 
-from khoj.utils.helpers import ConversationCommand
-
 
 class BaseModel(models.Model):
     created_at = models.DateTimeField(auto_now_add=True)
@@ -245,19 +244,58 @@ class GithubRepoConfig(BaseModel):
     github_config = models.ForeignKey(GithubConfig, on_delete=models.CASCADE, related_name="githubrepoconfig")
 
 
-class ServerChatSettings(BaseModel):
-    class WebScraper(models.TextChoices):
+class WebScraper(BaseModel):
+    class WebScraperType(models.TextChoices):
         FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl")
         OLOSTEP = "olostep", gettext_lazy("Olostep")
-        JINAAI = "jinaai", gettext_lazy("JinaAI")
+        JINA = "jina", gettext_lazy("Jina")
 
+    name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True)
+    type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA)
+    api_key = models.CharField(max_length=200, default=None, null=True, blank=True)
+    api_url = models.URLField(max_length=200, default=None, null=True, blank=True)
+
+    def clean(self):
+        error = {}
+        if self.name is None:
+            self.name = self.type.capitalize()
+        if self.api_url is None:
+            if self.type == self.WebScraperType.FIRECRAWL:
+                self.api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
+            elif self.type == self.WebScraperType.OLOSTEP:
+                self.api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
+            elif self.type == self.WebScraperType.JINA:
+                self.api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
+        if self.api_key is None:
+            if self.type == self.WebScraperType.FIRECRAWL:
+                self.api_key = os.getenv("FIRECRAWL_API_KEY")
+                if not self.api_key and self.api_url == "https://api.firecrawl.dev":
+                    error["api_key"] = "Set API key to use default Firecrawl. Get API key from https://firecrawl.dev."
+            elif self.type == self.WebScraperType.OLOSTEP:
+                self.api_key = os.getenv("OLOSTEP_API_KEY")
+                if self.api_key is None:
+                    error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/."
+            elif self.type == self.WebScraperType.JINA:
+                self.api_key = os.getenv("JINA_API_KEY")
+
+        if error:
+            raise ValidationError(error)
+
+    def save(self, *args, **kwargs):
+        self.clean()
+        super().save(*args, **kwargs)
+
+
+class ServerChatSettings(BaseModel):
     chat_default = models.ForeignKey(
         ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_default"
     )
     chat_advanced = models.ForeignKey(
         ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_advanced"
     )
-    web_scraper = models.CharField(max_length=20, choices=WebScraper.choices, default=WebScraper.JINAAI)
+    web_scraper = models.ForeignKey(
+        WebScraper, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="web_scraper"
+    )
 
 
 class LocalOrgConfig(BaseModel):
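A hedged sketch of how an admin might wire this up from a Django shell, assuming a configured Khoj install; the model, enum, and field names come from the diff above, while the key value is a placeholder:

# python manage.py shell  (assumes a working Khoj/Django environment)
from khoj.database.models import ServerChatSettings, WebScraper

# Add a Firecrawl scraper; clean() fills name and api_url from defaults/env if unset
scraper = WebScraper.objects.create(type=WebScraper.WebScraperType.FIRECRAWL, api_key="fc-placeholder-key")

# Pin the server to this single scraper; otherwise all admin-added scrapers are
# tried newest-first, then environment-configured ones
settings = ServerChatSettings.objects.first() or ServerChatSettings.objects.create()
settings.web_scraper = scraper
settings.save()

# Validation example: if OLOSTEP_API_KEY is not set in the environment,
# saving an Olostep scraper without a key raises ValidationError
WebScraper.objects.create(type=WebScraper.WebScraperType.OLOSTEP)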
@@ -11,7 +11,7 @@ from bs4 import BeautifulSoup
 from markdownify import markdownify
 
 from khoj.database.adapters import ConversationAdapters
-from khoj.database.models import Agent, KhojUser, ServerChatSettings
+from khoj.database.models import Agent, KhojUser, WebScraper
 from khoj.processor.conversation import prompts
 from khoj.routers.helpers import (
     ChatEvent,
@@ -27,16 +27,11 @@ logger = logging.getLogger(__name__)
 SERPER_DEV_API_KEY = os.getenv("SERPER_DEV_API_KEY")
 SERPER_DEV_URL = "https://google.serper.dev/search"
 
-JINA_READER_API_URL = "https://r.jina.ai/"
 JINA_SEARCH_API_URL = "https://s.jina.ai/"
 JINA_API_KEY = os.getenv("JINA_API_KEY")
 
-FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
-FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
 FIRECRAWL_TO_EXTRACT = os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true"
 
-OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY")
-OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI"
 OLOSTEP_QUERY_PARAMS = {
     "timeout": 35,  # seconds
     "waitBeforeScraping": 1,  # seconds
@@ -175,29 +170,47 @@ async def read_webpages(
         yield response
 
 
+async def read_webpage(
+    url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
+) -> Tuple[str | None, str | None]:
+    if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_TO_EXTRACT:
+        return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
+    elif scraper_type == WebScraper.WebScraperType.FIRECRAWL:
+        return await read_webpage_with_firecrawl(url, api_key, api_url), None
+    elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
+        return await read_webpage_with_olostep(url, api_key, api_url), None
+    else:
+        return await read_webpage_with_jina(url, api_key, api_url), None
+
+
 async def read_webpage_and_extract_content(
     subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None
 ) -> Tuple[set[str], str, Union[None, str]]:
-    # Select the web scraper to use for reading the web page
-    web_scraper = await ConversationAdapters.aget_webscraper(FIRECRAWL_API_KEY, OLOSTEP_API_KEY)
+    # Select the web scrapers to use for reading the web page
+    web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()
 
+    # Fallback through enabled web scrapers until we successfully read the web page
     extracted_info = None
-    try:
-        if is_none_or_empty(content):
-            with timer(f"Reading web page with {web_scraper.value} at '{url}' took", logger, log_level=logging.INFO):
-                if web_scraper == ServerChatSettings.WebScraper.FIRECRAWL:
-                    if FIRECRAWL_TO_EXTRACT:
-                        extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subqueries, agent)
-                    else:
-                        content = await read_webpage_with_firecrawl(url)
-                elif web_scraper == ServerChatSettings.WebScraper.OLOSTEP:
-                    content = await read_webpage_with_olostep(url)
-                else:
-                    content = await read_webpage_with_jina(url)
-        if is_none_or_empty(extracted_info):
-            with timer(f"Extracting relevant information from web page at '{url}' took", logger):
-                extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
-    except Exception as e:
-        logger.error(f"Failed to read web page with {web_scraper.value} at '{url}' with {e}")
+    for scraper_type, api_key, api_url, api_name in web_scrapers:
+        try:
+            # Read the web page
+            if is_none_or_empty(content):
+                with timer(f"Reading web page with {scraper_type} at '{url}' took", logger, log_level=logging.INFO):
+                    content, extracted_info = await read_webpage(url, scraper_type, api_key, api_url, subqueries, agent)
+
+            # Extract relevant information from the web page
+            if is_none_or_empty(extracted_info):
+                with timer(f"Extracting relevant information from web page at '{url}' took", logger):
+                    extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
+
+            # If we successfully extracted information, break the loop
+            if not is_none_or_empty(extracted_info):
+                break
+        except Exception as e:
+            logger.warning(f"Failed to read web page with {scraper_type} at '{url}' with {e}")
+            # If this is the last web scraper in the list, log an error
+            if api_name == web_scrapers[-1][-1]:
+                logger.error(f"All web scrapers failed for '{url}'")
 
     return subqueries, url, extracted_info
@@ -216,23 +229,23 @@ async def read_webpage_at_url(web_url: str) -> str:
             return markdownify(body)
 
 
-async def read_webpage_with_olostep(web_url: str) -> str:
-    headers = {"Authorization": f"Bearer {OLOSTEP_API_KEY}"}
+async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> str:
+    headers = {"Authorization": f"Bearer {api_key}"}
     web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy()  # type: ignore
     web_scraping_params["url"] = web_url
 
     async with aiohttp.ClientSession() as session:
-        async with session.get(OLOSTEP_API_URL, params=web_scraping_params, headers=headers) as response:
+        async with session.get(api_url, params=web_scraping_params, headers=headers) as response:
             response.raise_for_status()
             response_json = await response.json()
             return response_json["markdown_content"]
 
 
-async def read_webpage_with_jina(web_url: str) -> str:
-    jina_reader_api_url = f"{JINA_READER_API_URL}/{web_url}"
+async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> str:
+    jina_reader_api_url = f"{api_url}/{web_url}"
     headers = {"Accept": "application/json", "X-Timeout": "30"}
-    if JINA_API_KEY:
-        headers["Authorization"] = f"Bearer {JINA_API_KEY}"
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
 
     async with aiohttp.ClientSession() as session:
         async with session.get(jina_reader_api_url, headers=headers) as response:
|
||||||
return response_json["data"]["content"]
|
return response_json["data"]["content"]
|
||||||
|
|
||||||
|
|
||||||
async def read_webpage_with_firecrawl(web_url: str) -> str:
|
async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
|
||||||
firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape"
|
firecrawl_api_url = f"{api_url}/v1/scrape"
|
||||||
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
|
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
|
||||||
params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]}
|
params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]}
|
||||||
|
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession() as session:
|
||||||
|
@ -253,9 +266,11 @@ async def read_webpage_with_firecrawl(web_url: str) -> str:
|
||||||
return response_json["data"]["markdown"]
|
return response_json["data"]["markdown"]
|
||||||
|
|
||||||
|
|
||||||
async def read_webpage_and_extract_content_with_firecrawl(web_url: str, queries: set[str], agent: Agent = None) -> str:
|
async def query_webpage_with_firecrawl(
|
||||||
firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape"
|
web_url: str, queries: set[str], api_key: str, api_url: str, agent: Agent = None
|
||||||
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
|
) -> str:
|
||||||
|
firecrawl_api_url = f"{api_url}/v1/scrape"
|
||||||
|
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
|
||||||
schema = {
|
schema = {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|