From d5ceff269133048726c1e0223e4acebf4c37aa61 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 22 Jun 2024 13:45:08 +0530 Subject: [PATCH] Update tests and documentation with Jina reader API usage and info Update offline, openai chat actor, director tests to not require Serper to run the online command tests Update documentation for self-hosted online search to mention no setup is required by default. But improvements can be made by using Serper.dev or Olostep --- documentation/docs/features/online_search.md | 14 +++++++----- tests/test_offline_chat_director.py | 23 ++++++++++---------- tests/test_openai_chat_director.py | 23 ++++++++++---------- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/documentation/docs/features/online_search.md b/documentation/docs/features/online_search.md index 894d515d..956eac0f 100644 --- a/documentation/docs/features/online_search.md +++ b/documentation/docs/features/online_search.md @@ -1,17 +1,21 @@ # Online Search -By default, Khoj will try to infer which information-sourcing tools are required to answer your question. Sometimes, you'll have a need for outside questions that the LLM's knowledge doesn't cover. In that case, it will use the `online` search feature. +Khoj will research on the internet to ground its responses when it determines that it needs fresh information outside its existing knowledge to answer the query. It will always show any online references it used to respond to your requests. -For example, these queries would trigger an online search: +By default, Khoj will try to infer which information sources it needs to read to answer your question. This can include reading your documents or researching information online. You can also explicitly trigger an online search by adding the `/online` prefix to your chat query. + +Example queries that should trigger an online search: - What's the latest news about the Israel-Palestine war? 
- Where can I find the best pizza in New York City? -- Deadline for filing taxes 2024. +- /online Deadline for filing taxes 2024. - Give me a summary of this article: https://en.wikipedia.org/wiki/Haitian_Revolution Try it out yourself! https://app.khoj.dev ## Self-Hosting -The general online search function currently requires an API key from Serper.dev. You can grab one here: https://serper.dev/, and then add it as an environment variable with the name `SERPER_DEV_API_KEY`. +Online search works out of the box even when self-hosting. Khoj uses [JinaAI's reader API](https://jina.ai/reader/) to search online and read webpages by default. No API key setup is necessary. -Without any API keys, Khoj will use the `requests` library to directly read any webpages you give it a link to. This means that you can use Khoj to read any webpage that you have access in your local network. +To improve online search, set the `SERPER_DEV_API_KEY` environment variable to your [Serper.dev](https://serper.dev/) API key. Serper search results include additional context such as the answer box and knowledge graph. + +For advanced webpage reading, set the `OLOSTEP_API_KEY` environment variable to your [Olostep](https://www.olostep.com/) API key. This has a higher success rate at reading webpages than the default webpage reader. 
diff --git a/tests/test_offline_chat_director.py b/tests/test_offline_chat_director.py index 1c4ad3ac..a72dae56 100644 --- a/tests/test_offline_chat_director.py +++ b/tests/test_offline_chat_director.py @@ -62,7 +62,6 @@ def test_offline_chat_with_no_chat_history_or_retrieved_content(client_offline_c # ---------------------------------------------------------------------------------------------------- -@pytest.mark.skipif(os.getenv("SERPER_DEV_API_KEY") is None, reason="requires SERPER_DEV_API_KEY") @pytest.mark.chatquality @pytest.mark.django_db(transaction=True) def test_chat_with_online_content(client_offline_chat): @@ -75,18 +74,18 @@ def test_chat_with_online_content(client_offline_chat): response_message = response_message.split("### compiled references")[0] # Assert - expected_responses = ["http://www.paulgraham.com/greatwork.html"] + expected_responses = [ + "https://paulgraham.com/greatwork.html", + "https://www.paulgraham.com/greatwork.html", + "http://www.paulgraham.com/greatwork.html", + ] assert response.status_code == 200 - assert any([expected_response in response_message for expected_response in expected_responses]), ( - "Expected links or serper not setup in response but got: " + response_message - ) + assert any( + [expected_response in response_message for expected_response in expected_responses] + ), f"Expected links: {expected_responses}. 
Actual response: {response_message}" # ---------------------------------------------------------------------------------------------------- -@pytest.mark.skipif( - os.getenv("SERPER_DEV_API_KEY") is None or os.getenv("OLOSTEP_API_KEY") is None, - reason="requires SERPER_DEV_API_KEY and OLOSTEP_API_KEY", -) @pytest.mark.chatquality @pytest.mark.django_db(transaction=True) def test_chat_with_online_webpage_content(client_offline_chat): @@ -101,9 +100,9 @@ def test_chat_with_online_webpage_content(client_offline_chat): # Assert expected_responses = ["185", "1871", "horse"] assert response.status_code == 200 - assert any([expected_response in response_message for expected_response in expected_responses]), ( - "Expected links or serper not setup in response but got: " + response_message - ) + assert any( + [expected_response in response_message for expected_response in expected_responses] + ), f"Expected response with {expected_responses}. But actual response had: {response_message}" # ---------------------------------------------------------------------------------------------------- diff --git a/tests/test_openai_chat_director.py b/tests/test_openai_chat_director.py index b547f78e..26d93d31 100644 --- a/tests/test_openai_chat_director.py +++ b/tests/test_openai_chat_director.py @@ -61,7 +61,6 @@ def test_chat_with_no_chat_history_or_retrieved_content(chat_client): # ---------------------------------------------------------------------------------------------------- -@pytest.mark.skipif(os.getenv("SERPER_DEV_API_KEY") is None, reason="requires SERPER_DEV_API_KEY") @pytest.mark.chatquality @pytest.mark.django_db(transaction=True) def test_chat_with_online_content(chat_client): @@ -74,18 +73,18 @@ def test_chat_with_online_content(chat_client): response_message = response_message.split("### compiled references")[0] # Assert - expected_responses = ["http://www.paulgraham.com/greatwork.html"] + expected_responses = [ + "https://paulgraham.com/greatwork.html", + 
"https://www.paulgraham.com/greatwork.html", + "http://www.paulgraham.com/greatwork.html", + ] assert response.status_code == 200 - assert any([expected_response in response_message for expected_response in expected_responses]), ( - "Expected links or serper not setup in response but got: " + response_message - ) + assert any( + [expected_response in response_message for expected_response in expected_responses] + ), f"Expected links: {expected_responses}. Actual response: {response_message}" # ---------------------------------------------------------------------------------------------------- -@pytest.mark.skipif( - os.getenv("SERPER_DEV_API_KEY") is None or os.getenv("OLOSTEP_API_KEY") is None, - reason="requires SERPER_DEV_API_KEY and OLOSTEP_API_KEY", -) @pytest.mark.chatquality @pytest.mark.django_db(transaction=True) def test_chat_with_online_webpage_content(chat_client): @@ -100,9 +99,9 @@ def test_chat_with_online_webpage_content(chat_client): # Assert expected_responses = ["185", "1871", "horse"] assert response.status_code == 200 - assert any([expected_response in response_message for expected_response in expected_responses]), ( - "Expected links or serper not setup in response but got: " + response_message - ) + assert any( + [expected_response in response_message for expected_response in expected_responses] + ), f"Expected links: {expected_responses}. Actual response: {response_message}" # ----------------------------------------------------------------------------------------------------