diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py
index 51b8afe6..8c6aa5e4 100644
--- a/src/khoj/database/adapters/__init__.py
+++ b/src/khoj/database/adapters/__init__.py
@@ -1,6 +1,7 @@
 import json
 import logging
 import math
+import os
 import random
 import re
 import secrets
@@ -10,7 +11,6 @@ from enum import Enum
 from typing import Callable, Iterable, List, Optional, Type
 
 import cron_descriptor
-import django
 from apscheduler.job import Job
 from asgiref.sync import sync_to_async
 from django.contrib.sessions.backends.db import SessionStore
@@ -52,6 +52,7 @@ from khoj.database.models import (
     UserTextToImageModelConfig,
     UserVoiceModelConfig,
     VoiceModelOption,
+    WebScraper,
 )
 from khoj.processor.conversation import prompts
 from khoj.search_filter.date_filter import DateFilter
@@ -1032,17 +1033,43 @@ class ConversationAdapters:
         return await ConversationAdapters.aget_default_conversation_config(user)
 
     @staticmethod
-    async def aget_webscraper(FIRECRAWL_API_KEY: str = None, OLOSTEP_API_KEY: str = None):
-        server_chat_settings: ServerChatSettings = await ServerChatSettings.objects.filter().afirst()
+    async def aget_server_webscraper():
+        server_chat_settings = await ServerChatSettings.objects.filter().prefetch_related("web_scraper").afirst()
         if server_chat_settings is not None and server_chat_settings.web_scraper is not None:
-            web_scraper = ServerChatSettings.WebScraper(server_chat_settings.web_scraper)
-            if (web_scraper == ServerChatSettings.WebScraper.FIRECRAWL and FIRECRAWL_API_KEY) or (
-                web_scraper == ServerChatSettings.WebScraper.OLOSTEP and OLOSTEP_API_KEY
-            ):
-                return web_scraper
-        # Fallback to JinaAI if the API keys for the other providers are not set
-        # JinaAI is the default web scraper as it does not require an API key
-        return ServerChatSettings.WebScraper.JINAAI
+            return server_chat_settings.web_scraper
+        return None
+
+    @staticmethod
+    async def aget_enabled_webscrapers():
+        enabled_scrapers = []
+        server_webscraper = await ConversationAdapters.aget_server_webscraper()
+        if server_webscraper:
+            # Only use the web scraper set in the server chat settings
+            enabled_scrapers = [
+                (server_webscraper.type, server_webscraper.api_key, server_webscraper.api_url, server_webscraper.name)
+            ]
+        if not enabled_scrapers:
+            # Use the configured web scrapers, newest created first, until web page content is retrieved
+            enabled_scrapers = [
+                (scraper.type, scraper.api_key, scraper.api_url, scraper.name)
+                async for scraper in WebScraper.objects.all().order_by("-created_at").aiterator()
+            ]
+        if not enabled_scrapers:
+            # Use scrapers enabled via environment variables
+            if os.getenv("FIRECRAWL_API_KEY"):
+                api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
+                enabled_scrapers.append(
+                    (WebScraper.WebScraperType.FIRECRAWL, os.getenv("FIRECRAWL_API_KEY"), api_url, "Firecrawl")
+                )
+            if os.getenv("OLOSTEP_API_KEY"):
+                api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
+                enabled_scrapers.append(
+                    (WebScraper.WebScraperType.OLOSTEP, os.getenv("OLOSTEP_API_KEY"), api_url, "Olostep")
+                )
+            # Jina is the default fallback scraper as it does not require an API key
+            api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
+            enabled_scrapers.append((WebScraper.WebScraperType.JINA, os.getenv("JINA_API_KEY"), api_url, "Jina"))
+        return enabled_scrapers
 
     @staticmethod
     def create_conversation_from_public_conversation(
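A minimal sketch of how a caller might consume the new adapter, assuming it runs inside Khoj's async context; the helper name and example URL here are illustrative, not part of this change:

from khoj.database.adapters import ConversationAdapters

async def preview_scraper_order(url: str):
    # Scrapers arrive as (type, api_key, api_url, name) tuples: the server chat
    # setting wins, else configured rows newest first, else env-var fallbacks,
    # always ending with keyless Jina.
    for scraper_type, api_key, api_url, name in await ConversationAdapters.aget_enabled_webscrapers():
        print(f"would try {name} ({scraper_type}) via {api_url} for {url}")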
diff --git a/src/khoj/database/admin.py b/src/khoj/database/admin.py
index 51988752..8e650922 100644
--- a/src/khoj/database/admin.py
+++ b/src/khoj/database/admin.py
@@ -31,6 +31,7 @@ from khoj.database.models import (
     UserSearchModelConfig,
     UserVoiceModelConfig,
     VoiceModelOption,
+    WebScraper,
 )
 from khoj.utils.helpers import ImageIntentType
 
@@ -202,6 +203,19 @@ class ServerChatSettingsAdmin(admin.ModelAdmin):
     )
 
 
+@admin.register(WebScraper)
+class WebScraperAdmin(admin.ModelAdmin):
+    list_display = (
+        "name",
+        "type",
+        "api_key",
+        "api_url",
+        "created_at",
+    )
+    search_fields = ("name", "api_key", "api_url", "type")
+    ordering = ("-created_at",)
+
+
 @admin.register(Conversation)
 class ConversationAdmin(admin.ModelAdmin):
     list_display = (
diff --git a/src/khoj/database/migrations/0068_serverchatsettings_web_scraper.py b/src/khoj/database/migrations/0068_serverchatsettings_web_scraper.py
deleted file mode 100644
index 89482dbd..00000000
--- a/src/khoj/database/migrations/0068_serverchatsettings_web_scraper.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Generated by Django 5.0.8 on 2024-10-16 00:06
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-    dependencies = [
-        ("database", "0067_alter_agent_style_icon"),
-    ]
-
-    operations = [
-        migrations.AddField(
-            model_name="serverchatsettings",
-            name="web_scraper",
-            field=models.CharField(
-                choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jinaai", "JinaAI")],
-                default="jinaai",
-                max_length=20,
-            ),
-        ),
-    ]
diff --git a/src/khoj/database/migrations/0068_webscraper_serverchatsettings_web_scraper.py b/src/khoj/database/migrations/0068_webscraper_serverchatsettings_web_scraper.py
new file mode 100644
index 00000000..41d9c80b
--- /dev/null
+++ b/src/khoj/database/migrations/0068_webscraper_serverchatsettings_web_scraper.py
@@ -0,0 +1,47 @@
+# Generated by Django 5.0.8 on 2024-10-16 06:51
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("database", "0067_alter_agent_style_icon"),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name="WebScraper",
+            fields=[
+                ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
+                ("created_at", models.DateTimeField(auto_now_add=True)),
+                ("updated_at", models.DateTimeField(auto_now=True)),
+                ("name", models.CharField(blank=True, default=None, max_length=200, null=True, unique=True)),
+                (
+                    "type",
+                    models.CharField(
+                        choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jina", "Jina")],
+                        default="jina",
+                        max_length=20,
+                    ),
+                ),
+                ("api_key", models.CharField(blank=True, default=None, max_length=200, null=True)),
+                ("api_url", models.URLField(blank=True, default=None, null=True)),
+            ],
+            options={
+                "abstract": False,
+            },
+        ),
+        migrations.AddField(
+            model_name="serverchatsettings",
+            name="web_scraper",
+            field=models.ForeignKey(
+                blank=True,
+                default=None,
+                null=True,
+                on_delete=django.db.models.deletion.CASCADE,
+                related_name="web_scraper",
+                to="database.webscraper",
+            ),
+        ),
+    ]
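After applying this migration, rows can also be seeded outside the admin panel; a hedged sketch from `python manage.py shell`, where the name and key values are placeholders:

from khoj.database.models import WebScraper

WebScraper.objects.create(
    name="Firecrawl Prod",  # placeholder label; must be unique
    type=WebScraper.WebScraperType.FIRECRAWL,
    api_key="fc-...",  # placeholder; clean() rejects a missing key for hosted Firecrawl
    api_url="https://api.firecrawl.dev",
)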
diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py
index 7c4a16fa..ec36c6f3 100644
--- a/src/khoj/database/models/__init__.py
+++ b/src/khoj/database/models/__init__.py
@@ -1,3 +1,4 @@
+import os
 import re
 import uuid
 from random import choice
@@ -12,8 +13,6 @@ from django.utils.translation import gettext_lazy
 from pgvector.django import VectorField
 from phonenumber_field.modelfields import PhoneNumberField
 
-from khoj.utils.helpers import ConversationCommand
-
 
 class BaseModel(models.Model):
     created_at = models.DateTimeField(auto_now_add=True)
@@ -245,19 +244,58 @@ class GithubRepoConfig(BaseModel):
     github_config = models.ForeignKey(GithubConfig, on_delete=models.CASCADE, related_name="githubrepoconfig")
 
 
-class ServerChatSettings(BaseModel):
-    class WebScraper(models.TextChoices):
+class WebScraper(BaseModel):
+    class WebScraperType(models.TextChoices):
         FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl")
         OLOSTEP = "olostep", gettext_lazy("Olostep")
-        JINAAI = "jinaai", gettext_lazy("JinaAI")
+        JINA = "jina", gettext_lazy("Jina")
 
+    name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True)
+    type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA)
+    api_key = models.CharField(max_length=200, default=None, null=True, blank=True)
+    api_url = models.URLField(max_length=200, default=None, null=True, blank=True)
+
+    def clean(self):
+        error = {}
+        if self.name is None:
+            self.name = self.type.capitalize()
+        if self.api_url is None:
+            if self.type == self.WebScraperType.FIRECRAWL:
+                self.api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
+            elif self.type == self.WebScraperType.OLOSTEP:
+                self.api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
+            elif self.type == self.WebScraperType.JINA:
+                self.api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
+        if self.api_key is None:
+            if self.type == self.WebScraperType.FIRECRAWL:
+                self.api_key = os.getenv("FIRECRAWL_API_KEY")
+                if not self.api_key and self.api_url == "https://api.firecrawl.dev":
+                    error["api_key"] = "Set API key to use default Firecrawl. Get API key from https://firecrawl.dev."
+            elif self.type == self.WebScraperType.OLOSTEP:
+                self.api_key = os.getenv("OLOSTEP_API_KEY")
+                if self.api_key is None:
+                    error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/."
+            elif self.type == self.WebScraperType.JINA:
+                self.api_key = os.getenv("JINA_API_KEY")
+
+        if error:
+            raise ValidationError(error)
+
+    def save(self, *args, **kwargs):
+        self.clean()
+        super().save(*args, **kwargs)
+
+
+class ServerChatSettings(BaseModel):
     chat_default = models.ForeignKey(
         ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_default"
     )
     chat_advanced = models.ForeignKey(
         ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_advanced"
     )
-    web_scraper = models.CharField(max_length=20, choices=WebScraper.choices, default=WebScraper.JINAAI)
+    web_scraper = models.ForeignKey(
+        WebScraper, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="web_scraper"
+    )
 
 
 class LocalOrgConfig(BaseModel):
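To illustrate the clean() defaults above, a hedged sketch that assumes JINA_READER_API_URL and the other related env vars are unset:

from khoj.database.models import WebScraper

scraper = WebScraper(type=WebScraper.WebScraperType.JINA)
scraper.clean()  # also runs on save()
assert scraper.name == "Jina"  # derived from type via capitalize()
assert scraper.api_url == "https://r.jina.ai/"  # env-var default; Jina needs no API key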
diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py
index 2fbe8cf3..c111415b 100644
--- a/src/khoj/processor/tools/online_search.py
+++ b/src/khoj/processor/tools/online_search.py
@@ -11,7 +11,7 @@ from bs4 import BeautifulSoup
 from markdownify import markdownify
 
 from khoj.database.adapters import ConversationAdapters
-from khoj.database.models import Agent, KhojUser, ServerChatSettings
+from khoj.database.models import Agent, KhojUser, WebScraper
 from khoj.processor.conversation import prompts
 from khoj.routers.helpers import (
     ChatEvent,
@@ -27,16 +27,11 @@ logger = logging.getLogger(__name__)
 SERPER_DEV_API_KEY = os.getenv("SERPER_DEV_API_KEY")
 SERPER_DEV_URL = "https://google.serper.dev/search"
 
-JINA_READER_API_URL = "https://r.jina.ai/"
 JINA_SEARCH_API_URL = "https://s.jina.ai/"
 JINA_API_KEY = os.getenv("JINA_API_KEY")
 
-FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
-FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
 FIRECRAWL_TO_EXTRACT = os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true"
 
-OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY")
-OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI"
 OLOSTEP_QUERY_PARAMS = {
     "timeout": 35,  # seconds
     "waitBeforeScraping": 1,  # seconds
@@ -175,29 +170,47 @@ async def read_webpages(
         yield response
 
 
+async def read_webpage(
+    url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
+) -> Tuple[str | None, str | None]:
+    if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_TO_EXTRACT:
+        return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
+    elif scraper_type == WebScraper.WebScraperType.FIRECRAWL:
+        return await read_webpage_with_firecrawl(url, api_key, api_url), None
+    elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
+        return await read_webpage_with_olostep(url, api_key, api_url), None
+    else:
+        return await read_webpage_with_jina(url, api_key, api_url), None
+
+
 async def read_webpage_and_extract_content(
     subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None
 ) -> Tuple[set[str], str, Union[None, str]]:
-    # Select the web scraper to use for reading the web page
-    web_scraper = await ConversationAdapters.aget_webscraper(FIRECRAWL_API_KEY, OLOSTEP_API_KEY)
+    # Select the web scrapers to use for reading the web page
+    web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()
+
+    # Fall back through enabled web scrapers until the web page is read successfully
     extracted_info = None
-    try:
-        if is_none_or_empty(content):
-            with timer(f"Reading web page with {web_scraper.value} at '{url}' took", logger, log_level=logging.INFO):
-                if web_scraper == ServerChatSettings.WebScraper.FIRECRAWL:
-                    if FIRECRAWL_TO_EXTRACT:
-                        extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subqueries, agent)
-                    else:
-                        content = await read_webpage_with_firecrawl(url)
-                elif web_scraper == ServerChatSettings.WebScraper.OLOSTEP:
-                    content = await read_webpage_with_olostep(url)
-                else:
-                    content = await read_webpage_with_jina(url)
-        if is_none_or_empty(extracted_info):
-            with timer(f"Extracting relevant information from web page at '{url}' took", logger):
-                extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
-    except Exception as e:
-        logger.error(f"Failed to read web page with {web_scraper.value} at '{url}' with {e}")
+    for scraper_type, api_key, api_url, api_name in web_scrapers:
+        try:
+            # Read the web page
+            if is_none_or_empty(content):
+                with timer(f"Reading web page with {scraper_type} at '{url}' took", logger, log_level=logging.INFO):
+                    content, extracted_info = await read_webpage(url, scraper_type, api_key, api_url, subqueries, agent)
+
+            # Extract relevant information from the web page
+            if is_none_or_empty(extracted_info):
+                with timer(f"Extracting relevant information from web page at '{url}' took", logger):
+                    extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
+
+            # If we successfully extracted information, break the loop
+            if not is_none_or_empty(extracted_info):
+                break
+        except Exception as e:
+            logger.warning(f"Failed to read web page with {scraper_type} at '{url}' with {e}")
+            # If this is the last web scraper in the list, log an error
+            if api_name == web_scrapers[-1][-1]:
+                logger.error(f"All web scrapers failed for '{url}'")
 
     return subqueries, url, extracted_info
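A hedged sketch of calling the new read_webpage dispatcher directly, assuming FIRECRAWL_TO_EXTRACT is unset so the plain-scrape branch returns (content, None); the URL and key are placeholders:

import asyncio

from khoj.database.models import WebScraper
from khoj.processor.tools.online_search import read_webpage

content, extracted = asyncio.run(
    read_webpage(
        "https://example.com",  # placeholder URL
        scraper_type=WebScraper.WebScraperType.FIRECRAWL,
        api_key="fc-...",  # placeholder key
        api_url="https://api.firecrawl.dev",
    )
)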
@@ -216,23 +229,23 @@ async def read_webpage_at_url(web_url: str) -> str:
         return markdownify(body)
 
 
-async def read_webpage_with_olostep(web_url: str) -> str:
-    headers = {"Authorization": f"Bearer {OLOSTEP_API_KEY}"}
+async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> str:
+    headers = {"Authorization": f"Bearer {api_key}"}
     web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy()  # type: ignore
     web_scraping_params["url"] = web_url
 
     async with aiohttp.ClientSession() as session:
-        async with session.get(OLOSTEP_API_URL, params=web_scraping_params, headers=headers) as response:
+        async with session.get(api_url, params=web_scraping_params, headers=headers) as response:
             response.raise_for_status()
             response_json = await response.json()
             return response_json["markdown_content"]
 
 
-async def read_webpage_with_jina(web_url: str) -> str:
-    jina_reader_api_url = f"{JINA_READER_API_URL}/{web_url}"
+async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> str:
+    jina_reader_api_url = f"{api_url}/{web_url}"
     headers = {"Accept": "application/json", "X-Timeout": "30"}
-    if JINA_API_KEY:
-        headers["Authorization"] = f"Bearer {JINA_API_KEY}"
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
 
     async with aiohttp.ClientSession() as session:
         async with session.get(jina_reader_api_url, headers=headers) as response:
@@ -241,9 +254,9 @@ async def read_webpage_with_jina(web_url: str) -> str:
             return response_json["data"]["content"]
 
 
-async def read_webpage_with_firecrawl(web_url: str) -> str:
-    firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape"
-    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
+async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
+    firecrawl_api_url = f"{api_url}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
     params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]}
 
     async with aiohttp.ClientSession() as session:
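Since the Jina reader above only sets an Authorization header when api_key is truthy, a keyless call like this hedged sketch should work for low-volume use; the URL is a placeholder:

import asyncio

from khoj.processor.tools.online_search import read_webpage_with_jina

markdown = asyncio.run(
    read_webpage_with_jina("https://example.com", api_key=None, api_url="https://r.jina.ai/")
)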
@@ -253,9 +266,11 @@ async def read_webpage_with_firecrawl(web_url: str) -> str:
             return response_json["data"]["markdown"]
 
 
-async def read_webpage_and_extract_content_with_firecrawl(web_url: str, queries: set[str], agent: Agent = None) -> str:
-    firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape"
-    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
+async def query_webpage_with_firecrawl(
+    web_url: str, queries: set[str], api_key: str, api_url: str, agent: Agent = None
+) -> str:
+    firecrawl_api_url = f"{api_url}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
     schema = {
         "type": "object",
         "properties": {
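Finally, a hedged sketch of the renamed Firecrawl extract path, which read_webpage takes when FIRECRAWL_TO_EXTRACT=true; the URL, query, and key are placeholders:

import asyncio

from khoj.processor.tools.online_search import query_webpage_with_firecrawl

info = asyncio.run(
    query_webpage_with_firecrawl(
        "https://example.com",  # placeholder URL
        queries={"What does this page say about pricing?"},  # placeholder query
        api_key="fc-...",  # placeholder key
        api_url="https://api.firecrawl.dev",
    )
)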