diff --git a/docker-compose.yml b/docker-compose.yml index 572e592e..3a8c90ec 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,11 +18,19 @@ services: image: ghcr.io/khoj-ai/terrarium:latest ports: - "8080:8080" + search: + image: docker.io/searxng/searxng:latest + ports: + - "42113:8080" + volumes: + - khoj_search:/etc/searxng + environment: + - SEARXNG_BASE_URL=http://localhost:42113/ server: depends_on: database: condition: service_healthy - # Use the following line to use the latest version of khoj. Otherwise, it will build from source. Set this to ghcr.io/khoj-ai/khoj-cloud if you want to use the prod image. + # Use the following line to use the latest version of khoj. Otherwise, it will build from source. Set this to ghcr.io/khoj-ai/khoj-cloud:latest if you want to use the prod image. image: ghcr.io/khoj-ai/khoj:latest # Uncomment the following line to build from source. This will take a few minutes. Comment the next two lines out if you want to use the official image. # build: @@ -51,6 +59,8 @@ services: - KHOJ_ADMIN_PASSWORD=password # Default URL of Terrarium, the Python sandbox used by Khoj to run code. Its container is specified above - KHOJ_TERRARIUM_URL=http://host.docker.internal:8080 + # Default URL of SearxNG, the default web search engine used by Khoj. Its container is specified above + - KHOJ_SEARXNG_URL=http://host.docker.internal:42113 # Uncomment line below to use with Ollama running on your local machine at localhost:11434. # Change URL to use with other OpenAI API compatible providers like VLLM, LMStudio etc. # - OPENAI_API_BASE=http://host.docker.internal:11434/v1/ @@ -85,7 +95,7 @@ services: # Telemetry helps us prioritize feature development and understand how people are using Khoj # Read more at https://docs.khoj.dev/miscellaneous/telemetry # - KHOJ_TELEMETRY_DISABLE=True - # Comment out this line when you're using the official ghcr.io/khoj-ai/khoj-cloud prod image. 
+ # Comment out this line when you're using the official ghcr.io/khoj-ai/khoj-cloud:latest prod image. command: --host="0.0.0.0" --port=42110 -vv --anonymous-mode --non-interactive @@ -93,3 +103,4 @@ volumes: khoj_config: khoj_db: khoj_models: + khoj_search: diff --git a/documentation/docs/features/online_search.md b/documentation/docs/features/online_search.md index 7bbbb1f9..de9e0926 100644 --- a/documentation/docs/features/online_search.md +++ b/documentation/docs/features/online_search.md @@ -14,8 +14,18 @@ Try it out yourself! https://app.khoj.dev ## Self-Hosting -Online search can work even with self-hosting! Khoj uses [JinaAI's reader API](https://jina.ai/reader/) to search online and read webpages by default. You can get a free API key via https://jina.ai/reader. Set the `JINA_API_KEY` environment variable to your Jina AI reader API key to enable online search. +### Search -To improve online search, set the `SERPER_DEV_API_KEY` environment variable to your [Serper.dev](https://serper.dev/) API key. These search results include additional context like answer box, knowledge graph etc. +Online search can work even with self-hosting! You have a few options: -For advanced webpage reading, set the `OLOSTEP_API_KEY` environment variable to your [Olostep](https://www.olostep.com/) API key. This has a higher success rate at reading webpages than the default webpage reader. +- If you're using Docker, online search should work out of the box with [searxng](https://github.com/searxng/searxng) using our standard `docker-compose.yml`. +- For a non-local, free solution, you can use [JinaAI's reader API](https://jina.ai/reader/) to search online and read webpages. You can get a free API key via https://jina.ai/reader. Set the `JINA_API_KEY` environment variable to your Jina AI reader API key to enable online search. +- To get production-grade, fast online search, set the `SERPER_DEV_API_KEY` environment variable to your [Serper.dev](https://serper.dev/) API key. 
These search results include additional context like answer box, knowledge graph, etc. + +### Webpage Reading + +Out of the box, you **don't have to do anything to enable webpage reading**. Khoj will automatically read webpages using the `requests` library. To get more distributed and scalable webpage reading, you can use the following options: + +- If you're using Jina AI's reader API for search, it should work automatically for webpage reading as well. +- For scalable webpage scraping, you can use [Firecrawl](https://www.firecrawl.dev/). Create a new [Webscraper](http://localhost:42110/server/admin/database/webscraper/add/). Set your Firecrawl API key in the Api Key field, and set the type to Firecrawl. +- For advanced webpage reading, you can use [Olostep](https://www.olostep.com/). This has a higher success rate at reading webpages than the default webpage readers. Create a new [Webscraper](http://localhost:42110/server/admin/database/webscraper/add/). Set your Olostep API key in the Api Key field, and set the type to Olostep. 
diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index 8795b358..1e14ee9d 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -102,8 +102,14 @@ async def search_online( async for event in send_status_func(f"**Searching the Internet for**: {subqueries_str}"): yield {ChatEvent.STATUS: event} + if SERPER_DEV_API_KEY: + search_func = search_with_serper + elif JINA_API_KEY: + search_func = search_with_jina + else: + search_func = search_with_searxng + with timer(f"Internet searches for {subqueries} took", logger): - search_func = search_with_google if SERPER_DEV_API_KEY else search_with_jina search_tasks = [search_func(subquery, location) for subquery in subqueries] search_results = await asyncio.gather(*search_tasks) response_dict = {subquery: search_result for subquery, search_result in search_results} @@ -148,7 +154,48 @@ async def search_online( yield response_dict -async def search_with_google(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]: +async def search_with_searxng(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]: + """Search using local SearXNG instance.""" + # Use environment variable or default to localhost + searxng_url = os.getenv("KHOJ_SEARXNG_URL", "http://localhost:42113") + search_url = f"{searxng_url}/search" + country_code = location.country_code.lower() if location and location.country_code else "us" + + params = {"q": query, "format": "html", "language": "en", "country": country_code, "categories": "general"} + + async with aiohttp.ClientSession() as session: + try: + async with session.get(search_url, params=params) as response: + if response.status != 200: + logger.error(f"SearXNG search failed to call {searxng_url}: {await response.text()}") + return query, {} + + html_content = await response.text() + + soup = BeautifulSoup(html_content, "html.parser") + organic_results = [] + + for result 
in soup.find_all("article", class_="result"): + title_elem = result.find("a", rel="noreferrer") + if title_elem: + title = title_elem.text.strip() + link = title_elem["href"] + + description_elem = result.find("p", class_="content") + description = description_elem.text.strip() if description_elem else None + + organic_results.append({"title": title, "link": link, "description": description}) + + extracted_search_result = {"organic": organic_results} + + return query, extracted_search_result + + except Exception as e: + logger.error(f"Error searching with SearXNG: {str(e)}") + return query, {} + + +async def search_with_serper(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]: country_code = location.country_code.lower() if location and location.country_code else "us" payload = json.dumps({"q": query, "gl": country_code}) headers = {"X-API-KEY": SERPER_DEV_API_KEY, "Content-Type": "application/json"}