mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Make the webpage scraper used configurable via the server admin panel
This commit is contained in:
parent
e47922e53a
commit
c841abe13f
5 changed files with 50 additions and 5 deletions
|
@ -1031,6 +1031,19 @@ class ConversationAdapters:
|
||||||
return server_chat_settings.chat_advanced
|
return server_chat_settings.chat_advanced
|
||||||
return await ConversationAdapters.aget_default_conversation_config(user)
|
return await ConversationAdapters.aget_default_conversation_config(user)
|
||||||
|
|
||||||
|
@staticmethod
async def aget_webscraper(FIRECRAWL_API_KEY: str = None, OLOSTEP_API_KEY: str = None):
    """Pick the web scraper to use, honoring the server admin's choice.

    The scraper configured in ServerChatSettings is used only when the API
    key it requires was supplied by the caller; otherwise fall back to
    JinaAI, which needs no API key.

    Args:
        FIRECRAWL_API_KEY: API key for the Firecrawl scraper, or None.
        OLOSTEP_API_KEY: API key for the Olostep scraper, or None.

    Returns:
        ServerChatSettings.WebScraper: the scraper to read web pages with.
    """
    # NOTE(review): presumably ServerChatSettings holds a single server-wide
    # row, hence fetching the first record without any filter.
    # `.afirst()` on the manager directly — an argument-less `.filter()` is redundant.
    server_chat_settings: ServerChatSettings = await ServerChatSettings.objects.afirst()
    if server_chat_settings is not None and server_chat_settings.web_scraper is not None:
        web_scraper = ServerChatSettings.WebScraper(server_chat_settings.web_scraper)
        # Only honor the configured scraper when its required API key is set.
        if (web_scraper == ServerChatSettings.WebScraper.FIRECRAWL and FIRECRAWL_API_KEY) or (
            web_scraper == ServerChatSettings.WebScraper.OLOSTEP and OLOSTEP_API_KEY
        ):
            return web_scraper
    # Fallback to JinaAI if the API keys for the other providers are not set.
    # JinaAI is the default web scraper as it does not require an API key.
    return ServerChatSettings.WebScraper.JINAAI
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create_conversation_from_public_conversation(
|
def create_conversation_from_public_conversation(
|
||||||
user: KhojUser, public_conversation: PublicConversation, client_app: ClientApplication
|
user: KhojUser, public_conversation: PublicConversation, client_app: ClientApplication
|
||||||
|
|
|
@ -198,6 +198,7 @@ class ServerChatSettingsAdmin(admin.ModelAdmin):
|
||||||
# Columns shown in the Django admin changelist for server chat settings;
# "web_scraper" surfaces the admin-configurable scraper choice.
list_display = (
    "chat_default",
    "chat_advanced",
    "web_scraper",
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,21 @@
|
||||||
|
# Generated by Django 5.0.8 on 2024-10-16 00:06
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Add the admin-configurable `web_scraper` choice field to ServerChatSettings."""

    dependencies = [
        ("database", "0067_alter_agent_style_icon"),
    ]

    operations = [
        migrations.AddField(
            model_name="serverchatsettings",
            name="web_scraper",
            # Choice values mirror ServerChatSettings.WebScraper; JinaAI is the
            # default (per the model definition, it requires no API key).
            field=models.CharField(
                choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jinaai", "JinaAI")],
                default="jinaai",
                max_length=20,
            ),
        ),
    ]
|
|
@ -8,6 +8,7 @@ from django.core.exceptions import ValidationError
|
||||||
from django.db import models
|
from django.db import models
|
||||||
from django.db.models.signals import pre_save
|
from django.db.models.signals import pre_save
|
||||||
from django.dispatch import receiver
|
from django.dispatch import receiver
|
||||||
|
from django.utils.translation import gettext_lazy
|
||||||
from pgvector.django import VectorField
|
from pgvector.django import VectorField
|
||||||
from phonenumber_field.modelfields import PhoneNumberField
|
from phonenumber_field.modelfields import PhoneNumberField
|
||||||
|
|
||||||
|
@ -245,12 +246,18 @@ class GithubRepoConfig(BaseModel):
|
||||||
|
|
||||||
|
|
||||||
class ServerChatSettings(BaseModel):
    """Server-level chat configuration: default/advanced chat models and the
    web scraper to use (selectable via the server admin panel)."""

    class WebScraper(models.TextChoices):
        # Closed set of supported web-scraping providers; stored value is the
        # lowercase string, label is the human-readable name.
        FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl")
        OLOSTEP = "olostep", gettext_lazy("Olostep")
        JINAAI = "jinaai", gettext_lazy("JinaAI")

    # Chat model used by default; nullable so the server can run unconfigured.
    chat_default = models.ForeignKey(
        ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_default"
    )
    # Chat model for advanced usage; nullable like chat_default.
    chat_advanced = models.ForeignKey(
        ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_advanced"
    )
    # Admin-selected scraper; defaults to JinaAI (the provider that needs no API key).
    web_scraper = models.CharField(max_length=20, choices=WebScraper.choices, default=WebScraper.JINAAI)
|
||||||
|
|
||||||
|
|
||||||
class LocalOrgConfig(BaseModel):
|
class LocalOrgConfig(BaseModel):
|
||||||
|
|
|
@ -10,7 +10,8 @@ import aiohttp
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from markdownify import markdownify
|
from markdownify import markdownify
|
||||||
|
|
||||||
from khoj.database.models import Agent, KhojUser
|
from khoj.database.adapters import ConversationAdapters
|
||||||
|
from khoj.database.models import Agent, KhojUser, ServerChatSettings
|
||||||
from khoj.processor.conversation import prompts
|
from khoj.processor.conversation import prompts
|
||||||
from khoj.routers.helpers import (
|
from khoj.routers.helpers import (
|
||||||
ChatEvent,
|
ChatEvent,
|
||||||
|
@ -177,16 +178,18 @@ async def read_webpages(
|
||||||
async def read_webpage_and_extract_content(
|
async def read_webpage_and_extract_content(
|
||||||
subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None
|
subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None
|
||||||
) -> Tuple[set[str], str, Union[None, str]]:
|
) -> Tuple[set[str], str, Union[None, str]]:
|
||||||
|
# Select the web scraper to use for reading the web page
|
||||||
|
web_scraper = await ConversationAdapters.aget_webscraper(FIRECRAWL_API_KEY, OLOSTEP_API_KEY)
|
||||||
extracted_info = None
|
extracted_info = None
|
||||||
try:
|
try:
|
||||||
if is_none_or_empty(content):
|
if is_none_or_empty(content):
|
||||||
with timer(f"Reading web page at '{url}' took", logger):
|
with timer(f"Reading web page with {web_scraper.value} at '{url}' took", logger):
|
||||||
if FIRECRAWL_API_KEY:
|
if web_scraper == ServerChatSettings.WebScraper.FIRECRAWL:
|
||||||
if FIRECRAWL_TO_EXTRACT:
|
if FIRECRAWL_TO_EXTRACT:
|
||||||
extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subqueries, agent)
|
extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subqueries, agent)
|
||||||
else:
|
else:
|
||||||
content = await read_webpage_with_firecrawl(url)
|
content = await read_webpage_with_firecrawl(url)
|
||||||
elif OLOSTEP_API_KEY:
|
elif web_scraper == ServerChatSettings.WebScraper.OLOSTEP:
|
||||||
content = await read_webpage_with_olostep(url)
|
content = await read_webpage_with_olostep(url)
|
||||||
else:
|
else:
|
||||||
content = await read_webpage_with_jina(url)
|
content = await read_webpage_with_jina(url)
|
||||||
|
@ -194,7 +197,7 @@ async def read_webpage_and_extract_content(
|
||||||
with timer(f"Extracting relevant information from web page at '{url}' took", logger):
|
with timer(f"Extracting relevant information from web page at '{url}' took", logger):
|
||||||
extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
|
extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to read web page at '{url}' with {e}")
|
logger.error(f"Failed to read web page with {web_scraper.value} at '{url}' with {e}")
|
||||||
|
|
||||||
return subqueries, url, extracted_info
|
return subqueries, url, extracted_info
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue