diff --git a/docker-compose.yml b/docker-compose.yml
index 572e592e..d174bc5a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -18,6 +18,15 @@ services:
     image: ghcr.io/khoj-ai/terrarium:latest
     ports:
       - "8080:8080"
+  search:
+    image: docker.io/searxng/searxng:latest
+    ports:
+      - "42113:8080"
+    volumes:
+      - khoj_search:/etc/searxng
+    environment:
+      - SEARXNG_BASE_URL=http://localhost:42113/
+      - SEARXNG_SECRET_KEY=change_me_to_something_random # Change this in production
   server:
     depends_on:
       database:
@@ -51,6 +60,8 @@ services:
       - KHOJ_ADMIN_PASSWORD=password
       # Default URL of Terrarium, the Python sandbox used by Khoj to run code. Its container is specified above
       - KHOJ_TERRARIUM_URL=http://host.docker.internal:8080
+      # Default URL of SearxNG, the default web search engine used by Khoj. Its container is specified above
+      - KHOJ_SEARXNG_URL=http://host.docker.internal:42113
       # Uncomment line below to use with Ollama running on your local machine at localhost:11434.
       # Change URL to use with other OpenAI API compatible providers like VLLM, LMStudio etc.
       # - OPENAI_API_BASE=http://host.docker.internal:11434/v1/
@@ -93,3 +104,4 @@ volumes:
   khoj_config:
   khoj_db:
   khoj_models:
+  khoj_search:
diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py
index 8795b358..1e14ee9d 100644
--- a/src/khoj/processor/tools/online_search.py
+++ b/src/khoj/processor/tools/online_search.py
@@ -102,8 +102,14 @@ async def search_online(
         async for event in send_status_func(f"**Searching the Internet for**: {subqueries_str}"):
             yield {ChatEvent.STATUS: event}
 
+    if SERPER_DEV_API_KEY:
+        search_func = search_with_serper
+    elif JINA_API_KEY:
+        search_func = search_with_jina
+    else:
+        search_func = search_with_searxng
+
     with timer(f"Internet searches for {subqueries} took", logger):
-        search_func = search_with_google if SERPER_DEV_API_KEY else search_with_jina
         search_tasks = [search_func(subquery, location) for subquery in subqueries]
         search_results = await asyncio.gather(*search_tasks)
         response_dict = {subquery: search_result for subquery, search_result in search_results}
@@ -148,7 +154,58 @@ async def search_online(
     yield response_dict
 
 
-async def search_with_google(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
+async def search_with_searxng(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
+    """
+    Search the web for a query using a SearXNG instance.
+
+    The instance URL is read from the KHOJ_SEARXNG_URL environment variable (default: http://localhost:42113).
+    Results are scraped from the HTML results page returned by the instance's /search endpoint.
+
+    Returns a (query, results) tuple. On success results is {"organic": [...]}, where each entry has
+    a title, link and description. On a non-200 response or any error results is an empty dict.
+    """
+    # Strip trailing slashes so a configured URL like "http://host:42113/" does not yield "//search"
+    searxng_url = os.getenv("KHOJ_SEARXNG_URL", "http://localhost:42113").rstrip("/")
+    search_url = f"{searxng_url}/search"
+    country_code = location.country_code.lower() if location and location.country_code else "us"
+
+    params = {"q": query, "format": "html", "language": "en", "country": country_code, "categories": "general"}
+
+    async with aiohttp.ClientSession() as session:
+        try:
+            async with session.get(search_url, params=params) as response:
+                if response.status != 200:
+                    logger.error(f"SearXNG search failed to call {searxng_url}: {await response.text()}")
+                    return query, {}
+
+                html_content = await response.text()
+
+                soup = BeautifulSoup(html_content, "html.parser")
+                organic_results = []
+
+                for result in soup.find_all("article", class_="result"):
+                    title_elem = result.find("a", rel="noreferrer")
+                    # Skip entries without a usable link instead of letting one bad anchor fail the whole search
+                    link = title_elem.get("href") if title_elem else None
+                    if not link:
+                        continue
+
+                    title = title_elem.text.strip()
+                    description_elem = result.find("p", class_="content")
+                    description = description_elem.text.strip() if description_elem else None
+
+                    organic_results.append({"title": title, "link": link, "description": description})
+
+                extracted_search_result = {"organic": organic_results}
+
+                return query, extracted_search_result
+
+        except Exception as e:
+            logger.error(f"Error searching with SearXNG: {str(e)}")
+            return query, {}
+
+
+async def search_with_serper(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
     country_code = location.country_code.lower() if location and location.country_code else "us"
     payload = json.dumps({"q": query, "gl": country_code})
     headers = {"X-API-KEY": SERPER_DEV_API_KEY, "Content-Type": "application/json"}