Support using Firecrawl to read webpages

Firecrawl is open-source, self-hostable with a default hosted service
provided, similar to Jina.ai. So it can be
1. Self-hosted as part of a private Khoj cloud deployment
2. Used directly by getting an API key from the Firecrawl.dev service

This is as an alternative to Olostep and Jina.ai for reading webpages.
This commit is contained in:
Debanjum Singh Solanky 2024-10-15 10:52:19 -07:00
parent 731ea3779e
commit 993fd7cd2b

View file

@ -29,6 +29,9 @@ JINA_READER_API_URL = "https://r.jina.ai/"
JINA_SEARCH_API_URL = "https://s.jina.ai/" JINA_SEARCH_API_URL = "https://s.jina.ai/"
JINA_API_KEY = os.getenv("JINA_API_KEY") JINA_API_KEY = os.getenv("JINA_API_KEY")
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY") OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY")
OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI" OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI"
OLOSTEP_QUERY_PARAMS = { OLOSTEP_QUERY_PARAMS = {
@ -172,7 +175,12 @@ async def read_webpage_and_extract_content(
try: try:
if is_none_or_empty(content): if is_none_or_empty(content):
with timer(f"Reading web page at '{url}' took", logger): with timer(f"Reading web page at '{url}' took", logger):
content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_with_jina(url) if FIRECRAWL_API_KEY:
content = await read_webpage_with_firecrawl(url)
elif OLOSTEP_API_KEY:
content = await read_webpage_with_olostep(url)
else:
content = await read_webpage_with_jina(url)
with timer(f"Extracting relevant information from web page at '{url}' took", logger): with timer(f"Extracting relevant information from web page at '{url}' took", logger):
extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent) extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent)
return subquery, extracted_info, url return subquery, extracted_info, url
@ -220,6 +228,18 @@ async def read_webpage_with_jina(web_url: str) -> str:
return response_json["data"]["content"] return response_json["data"]["content"]
async def read_webpage_with_firecrawl(web_url: str) -> str:
firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape"
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]}
async with aiohttp.ClientSession() as session:
async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
response.raise_for_status()
response_json = await response.json()
return response_json["data"]["markdown"]
async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]: async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
encoded_query = urllib.parse.quote(query) encoded_query = urllib.parse.quote(query)
jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}" jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"