Configure which webpage scraper to use via the server admin panel

Debanjum Singh Solanky 2024-10-15 17:17:36 -07:00
parent e47922e53a
commit c841abe13f
5 changed files with 50 additions and 5 deletions


@@ -1031,6 +1031,19 @@ class ConversationAdapters:
             return server_chat_settings.chat_advanced
         return await ConversationAdapters.aget_default_conversation_config(user)
 
+    @staticmethod
+    async def aget_webscraper(FIRECRAWL_API_KEY: str = None, OLOSTEP_API_KEY: str = None):
+        server_chat_settings: ServerChatSettings = await ServerChatSettings.objects.filter().afirst()
+        if server_chat_settings is not None and server_chat_settings.web_scraper is not None:
+            web_scraper = ServerChatSettings.WebScraper(server_chat_settings.web_scraper)
+            if (web_scraper == ServerChatSettings.WebScraper.FIRECRAWL and FIRECRAWL_API_KEY) or (
+                web_scraper == ServerChatSettings.WebScraper.OLOSTEP and OLOSTEP_API_KEY
+            ):
+                return web_scraper
+        # Fallback to JinaAI if the API keys for the other providers are not set
+        # JinaAI is the default web scraper as it does not require an API key
+        return ServerChatSettings.WebScraper.JINAAI
+
     @staticmethod
     def create_conversation_from_public_conversation(
         user: KhojUser, public_conversation: PublicConversation, client_app: ClientApplication
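The new adapter method above resolves the admin-selected scraper, honoring Firecrawl or Olostep only when the matching API key is supplied, and otherwise falling back to JinaAI, which needs no key. A minimal usage sketch, assuming a configured Django/Khoj environment; the key value is a hypothetical placeholder:

import asyncio

from khoj.database.adapters import ConversationAdapters

async def demo():
    # With a Firecrawl key but no Olostep key, an admin selection of
    # Firecrawl is honored; any other selection falls back to JinaAI.
    scraper = await ConversationAdapters.aget_webscraper(
        FIRECRAWL_API_KEY="fc-dummy-key",  # hypothetical placeholder
        OLOSTEP_API_KEY=None,
    )
    print(scraper.value)  # "firecrawl" or "jinaai"

asyncio.run(demo())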


@@ -198,6 +198,7 @@ class ServerChatSettingsAdmin(admin.ModelAdmin):
     list_display = (
         "chat_default",
         "chat_advanced",
+        "web_scraper",
     )
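This one-line change surfaces the new field as a column in the admin changelist. For context, a sketch of how the full ModelAdmin plausibly looks; only the list_display tuple appears in the diff, and the @admin.register decorator is an assumption:

from django.contrib import admin

from khoj.database.models import ServerChatSettings

@admin.register(ServerChatSettings)  # registration form assumed, not shown in the diff
class ServerChatSettingsAdmin(admin.ModelAdmin):
    # "web_scraper" renders as a column, so operators can see the
    # configured scraper alongside the default and advanced chat models.
    list_display = (
        "chat_default",
        "chat_advanced",
        "web_scraper",
    )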


@@ -0,0 +1,21 @@
+# Generated by Django 5.0.8 on 2024-10-16 00:06
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("database", "0067_alter_agent_style_icon"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="serverchatsettings",
+            name="web_scraper",
+            field=models.CharField(
+                choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jinaai", "JinaAI")],
+                default="jinaai",
+                max_length=20,
+            ),
+        ),
+    ]
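This auto-generated migration adds the web_scraper column with "jinaai" as the database default. A sketch of applying it programmatically, equivalent to running python manage.py migrate database; assumes DJANGO_SETTINGS_MODULE points at the Khoj settings module:

import django
from django.core.management import call_command

django.setup()  # requires DJANGO_SETTINGS_MODULE in the environment
call_command("migrate", "database")  # applies pending migrations, including this one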


@@ -8,6 +8,7 @@ from django.core.exceptions import ValidationError
 from django.db import models
 from django.db.models.signals import pre_save
 from django.dispatch import receiver
+from django.utils.translation import gettext_lazy
 from pgvector.django import VectorField
 from phonenumber_field.modelfields import PhoneNumberField

@@ -245,12 +246,18 @@ class GithubRepoConfig(BaseModel):
 class ServerChatSettings(BaseModel):
+    class WebScraper(models.TextChoices):
+        FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl")
+        OLOSTEP = "olostep", gettext_lazy("Olostep")
+        JINAAI = "jinaai", gettext_lazy("JinaAI")
+
     chat_default = models.ForeignKey(
         ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_default"
     )
     chat_advanced = models.ForeignKey(
         ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_advanced"
     )
+    web_scraper = models.CharField(max_length=20, choices=WebScraper.choices, default=WebScraper.JINAAI)
 
 
 class LocalOrgConfig(BaseModel):
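Because WebScraper is a TextChoices enum backed by a plain CharField, the stored string round-trips cleanly to an enum member, which is exactly what aget_webscraper does with server_chat_settings.web_scraper. A small sketch of that behavior, assuming Django is set up:

from khoj.database.models import ServerChatSettings

scraper = ServerChatSettings.WebScraper("olostep")  # rebuild the member from the stored string
assert scraper is ServerChatSettings.WebScraper.OLOSTEP
assert scraper.value == "olostep"       # the value the CharField persists
assert str(scraper.label) == "Olostep"  # gettext_lazy label, resolved here with no translations active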


@@ -10,7 +10,8 @@ import aiohttp
 from bs4 import BeautifulSoup
 from markdownify import markdownify
 
-from khoj.database.models import Agent, KhojUser
+from khoj.database.adapters import ConversationAdapters
+from khoj.database.models import Agent, KhojUser, ServerChatSettings
 from khoj.processor.conversation import prompts
 from khoj.routers.helpers import (
     ChatEvent,

@@ -177,16 +178,18 @@ async def read_webpages(
 async def read_webpage_and_extract_content(
     subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None
 ) -> Tuple[set[str], str, Union[None, str]]:
+    # Select the web scraper to use for reading the web page
+    web_scraper = await ConversationAdapters.aget_webscraper(FIRECRAWL_API_KEY, OLOSTEP_API_KEY)
+
     extracted_info = None
     try:
         if is_none_or_empty(content):
-            with timer(f"Reading web page at '{url}' took", logger):
-                if FIRECRAWL_API_KEY:
+            with timer(f"Reading web page with {web_scraper.value} at '{url}' took", logger):
+                if web_scraper == ServerChatSettings.WebScraper.FIRECRAWL:
                     if FIRECRAWL_TO_EXTRACT:
                         extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subqueries, agent)
                     else:
                         content = await read_webpage_with_firecrawl(url)
-                elif OLOSTEP_API_KEY:
+                elif web_scraper == ServerChatSettings.WebScraper.OLOSTEP:
                     content = await read_webpage_with_olostep(url)
                 else:
                     content = await read_webpage_with_jina(url)

@@ -194,7 +197,7 @@ async def read_webpage_and_extract_content(
         with timer(f"Extracting relevant information from web page at '{url}' took", logger):
             extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
     except Exception as e:
-        logger.error(f"Failed to read web page at '{url}' with {e}")
+        logger.error(f"Failed to read web page with {web_scraper.value} at '{url}' with {e}")
 
     return subqueries, url, extracted_info
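The key-presence checks are now enum comparisons, so the reader invoked always matches what aget_webscraper returned. The same routing can be read as a mapping from enum member to reader coroutine; a condensed sketch using the reader functions from this module, with the FIRECRAWL_TO_EXTRACT branch elided:

from khoj.database.models import ServerChatSettings

async def read_with(web_scraper: ServerChatSettings.WebScraper, url: str) -> str:
    # Dispatch-table equivalent of the if/elif/else chain above
    readers = {
        ServerChatSettings.WebScraper.FIRECRAWL: read_webpage_with_firecrawl,
        ServerChatSettings.WebScraper.OLOSTEP: read_webpage_with_olostep,
        ServerChatSettings.WebScraper.JINAAI: read_webpage_with_jina,
    }
    return await readers[web_scraper](url)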