mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Support using Firecrawl to read webpages
Firecrawl is open-source and self-hostable, with a default hosted service provided, similar to Jina.ai. It can therefore be (1) self-hosted as part of a private Khoj cloud deployment, or (2) used directly by getting an API key from the Firecrawl.dev service. This is an alternative to Olostep and Jina.ai for reading webpages.
This commit is contained in:
parent
731ea3779e
commit
993fd7cd2b
1 changed files with 21 additions and 1 deletions
|
@ -29,6 +29,9 @@ JINA_READER_API_URL = "https://r.jina.ai/"
|
||||||
# Configuration for the supported webpage reader/search services.
# Each service is selected at runtime based on which API key is set.

# Jina.ai: hosted reader (r.jina.ai) and search (s.jina.ai) APIs.
JINA_SEARCH_API_URL = "https://s.jina.ai/"
JINA_API_KEY = os.getenv("JINA_API_KEY")

# Firecrawl: open-source, self-hostable webpage scraper.
# FIRECRAWL_API_URL may point at a self-hosted instance; defaults to the hosted service.
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")

# Olostep: alternative hosted webpage reader.
OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY")
OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI"
OLOSTEP_QUERY_PARAMS = {
|
OLOSTEP_QUERY_PARAMS = {
|
||||||
|
@ -172,7 +175,12 @@ async def read_webpage_and_extract_content(
|
||||||
try:
|
try:
|
||||||
if is_none_or_empty(content):
|
if is_none_or_empty(content):
|
||||||
with timer(f"Reading web page at '{url}' took", logger):
|
with timer(f"Reading web page at '{url}' took", logger):
|
||||||
content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_with_jina(url)
|
if FIRECRAWL_API_KEY:
|
||||||
|
content = await read_webpage_with_firecrawl(url)
|
||||||
|
elif OLOSTEP_API_KEY:
|
||||||
|
content = await read_webpage_with_olostep(url)
|
||||||
|
else:
|
||||||
|
content = await read_webpage_with_jina(url)
|
||||||
with timer(f"Extracting relevant information from web page at '{url}' took", logger):
|
with timer(f"Extracting relevant information from web page at '{url}' took", logger):
|
||||||
extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent)
|
extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent)
|
||||||
return subquery, extracted_info, url
|
return subquery, extracted_info, url
|
||||||
|
@ -220,6 +228,18 @@ async def read_webpage_with_jina(web_url: str) -> str:
|
||||||
return response_json["data"]["content"]
|
return response_json["data"]["content"]
|
||||||
|
|
||||||
|
|
||||||
|
async def read_webpage_with_firecrawl(web_url: str) -> str:
    """Read the webpage at `web_url` via the Firecrawl scrape API.

    Returns the page content as markdown.
    Raises aiohttp.ClientResponseError if Firecrawl responds with an error status.
    """
    firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape"
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"}
    # Request markdown output; strip script tags and elements with the .ad class.
    params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]}

    async with aiohttp.ClientSession() as session:
        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
            response.raise_for_status()
            response_json = await response.json()
            return response_json["data"]["markdown"]
|
||||||
|
|
||||||
|
|
||||||
async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
|
async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
|
||||||
encoded_query = urllib.parse.quote(query)
|
encoded_query = urllib.parse.quote(query)
|
||||||
jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"
|
jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"
|
||||||
|
|
Loading…
Reference in a new issue