mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-12-18 10:37:11 +00:00
Support online search with Searxng as turnkey, self-hostable solution
This allows online search to work out of the box again for self-hosting users, as no auth/API key setup is required. Docker users do not need to change anything in their setup flow. Direct installers can set up SearXNG locally or use public instances if they do not want to use any of the other providers (like Jina, Serper). Resolves #749. Resolves #990
This commit is contained in:
parent
9c64275dec
commit
9d02978f6e
2 changed files with 61 additions and 2 deletions
|
@ -18,6 +18,15 @@ services:
|
||||||
image: ghcr.io/khoj-ai/terrarium:latest
|
image: ghcr.io/khoj-ai/terrarium:latest
|
||||||
ports:
|
ports:
|
||||||
- "8080:8080"
|
- "8080:8080"
|
||||||
|
search:
|
||||||
|
image: docker.io/searxng/searxng:latest
|
||||||
|
ports:
|
||||||
|
- "42113:8080"
|
||||||
|
volumes:
|
||||||
|
- khoj_search:/etc/searxng
|
||||||
|
environment:
|
||||||
|
- SEARXNG_BASE_URL=http://localhost:42113/
|
||||||
|
- SEARXNG_SECRET_KEY=change_me_to_something_random # Change this in production
|
||||||
server:
|
server:
|
||||||
depends_on:
|
depends_on:
|
||||||
database:
|
database:
|
||||||
|
@ -51,6 +60,8 @@ services:
|
||||||
- KHOJ_ADMIN_PASSWORD=password
|
- KHOJ_ADMIN_PASSWORD=password
|
||||||
# Default URL of Terrarium, the Python sandbox used by Khoj to run code. Its container is specified above
|
# Default URL of Terrarium, the Python sandbox used by Khoj to run code. Its container is specified above
|
||||||
- KHOJ_TERRARIUM_URL=http://host.docker.internal:8080
|
- KHOJ_TERRARIUM_URL=http://host.docker.internal:8080
|
||||||
|
# Default URL of SearxNG, the default web search engine used by Khoj. Its container is specified above
|
||||||
|
- KHOJ_SEARXNG_URL=http://host.docker.internal:42113
|
||||||
# Uncomment line below to use with Ollama running on your local machine at localhost:11434.
|
# Uncomment line below to use with Ollama running on your local machine at localhost:11434.
|
||||||
# Change URL to use with other OpenAI API compatible providers like VLLM, LMStudio etc.
|
# Change URL to use with other OpenAI API compatible providers like VLLM, LMStudio etc.
|
||||||
# - OPENAI_API_BASE=http://host.docker.internal:11434/v1/
|
# - OPENAI_API_BASE=http://host.docker.internal:11434/v1/
|
||||||
|
@ -93,3 +104,4 @@ volumes:
|
||||||
khoj_config:
|
khoj_config:
|
||||||
khoj_db:
|
khoj_db:
|
||||||
khoj_models:
|
khoj_models:
|
||||||
|
khoj_search:
|
||||||
|
|
|
@ -102,8 +102,14 @@ async def search_online(
|
||||||
async for event in send_status_func(f"**Searching the Internet for**: {subqueries_str}"):
|
async for event in send_status_func(f"**Searching the Internet for**: {subqueries_str}"):
|
||||||
yield {ChatEvent.STATUS: event}
|
yield {ChatEvent.STATUS: event}
|
||||||
|
|
||||||
|
if SERPER_DEV_API_KEY:
|
||||||
|
search_func = search_with_serper
|
||||||
|
elif JINA_API_KEY:
|
||||||
|
search_func = search_with_jina
|
||||||
|
else:
|
||||||
|
search_func = search_with_searxng
|
||||||
|
|
||||||
with timer(f"Internet searches for {subqueries} took", logger):
|
with timer(f"Internet searches for {subqueries} took", logger):
|
||||||
search_func = search_with_google if SERPER_DEV_API_KEY else search_with_jina
|
|
||||||
search_tasks = [search_func(subquery, location) for subquery in subqueries]
|
search_tasks = [search_func(subquery, location) for subquery in subqueries]
|
||||||
search_results = await asyncio.gather(*search_tasks)
|
search_results = await asyncio.gather(*search_tasks)
|
||||||
response_dict = {subquery: search_result for subquery, search_result in search_results}
|
response_dict = {subquery: search_result for subquery, search_result in search_results}
|
||||||
|
@ -148,7 +154,48 @@ async def search_online(
|
||||||
yield response_dict
|
yield response_dict
|
||||||
|
|
||||||
|
|
||||||
async def search_with_google(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
|
async def search_with_searxng(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
    """Search the web for `query` by scraping the HTML results page of a SearXNG instance.

    The instance URL is read from the KHOJ_SEARXNG_URL environment variable and
    defaults to http://localhost:42113.

    Args:
        query: The search query to run.
        location: Location of the user, may be None. Its lowercased country code is
            sent as the `country` request parameter, falling back to "us".

    Returns:
        A tuple of the original query and the extracted search results.
        On success the results look like
        `{"organic": [{"title": ..., "link": ..., "description": ...}, ...]}`,
        where `description` is None when the result has no snippet.
        On a non-200 response, or when any exception is hit while fetching or
        parsing, the error is logged and the results are an empty dict.
    """
    # Use environment variable or default to localhost.
    # Strip trailing slashes so a URL configured as "http://host:42113/" does not yield "//search".
    searxng_url = os.getenv("KHOJ_SEARXNG_URL", "http://localhost:42113").rstrip("/")
    search_url = f"{searxng_url}/search"
    country_code = location.country_code.lower() if location and location.country_code else "us"

    params = {"q": query, "format": "html", "language": "en", "country": country_code, "categories": "general"}

    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(search_url, params=params) as response:
                if response.status != 200:
                    logger.error(f"SearXNG search failed to call {searxng_url}: {await response.text()}")
                    return query, {}

                html_content = await response.text()

            soup = BeautifulSoup(html_content, "html.parser")
            organic_results = []

            for result in soup.find_all("article", class_="result"):
                title_elem = result.find("a", rel="noreferrer")
                # Skip results without a usable link instead of raising on a missing href,
                # which would otherwise discard every result parsed so far.
                link = title_elem.get("href") if title_elem else None
                if not link:
                    continue

                description_elem = result.find("p", class_="content")
                organic_results.append(
                    {
                        "title": title_elem.text.strip(),
                        "link": link,
                        "description": description_elem.text.strip() if description_elem else None,
                    }
                )

            return query, {"organic": organic_results}

        except Exception as e:
            # Best-effort search: log with traceback and return no results rather than failing the chat turn.
            logger.error(f"Error searching with SearXNG: {str(e)}", exc_info=True)
            return query, {}
|
||||||
|
|
||||||
|
|
||||||
|
async def search_with_serper(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
|
||||||
country_code = location.country_code.lower() if location and location.country_code else "us"
|
country_code = location.country_code.lower() if location and location.country_code else "us"
|
||||||
payload = json.dumps({"q": query, "gl": country_code})
|
payload = json.dumps({"q": query, "gl": country_code})
|
||||||
headers = {"X-API-KEY": SERPER_DEV_API_KEY, "Content-Type": "application/json"}
|
headers = {"X-API-KEY": SERPER_DEV_API_KEY, "Content-Type": "application/json"}
|
||||||
|
|
Loading…
Reference in a new issue