From 88f096977b22143a7ca02e4c707e5897c58d5c52 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Sun, 10 Mar 2024 00:08:48 +0530
Subject: [PATCH] Read webpages directly when Olostep proxy not setup

This is useful for self-hosted, individual user, low traffic setups
where a proxy service is not required
---
Review notes (outside the commit message; this region is skipped by `git am`):
 * NOTE(review): read_webpage() calls parsed_html.body.get_text() with no
   None check. Beautiful Soup returns None for a tag it cannot find, so a
   response without a <body> would raise AttributeError here. The caller
   wraps the call in `try:`; its handler is not visible in these hunks, so
   confirm that swallowing this case is intended.
 * NOTE(review): markdownify() is applied to text that get_text() has
   already stripped of tags, so it presumably cannot recover headings,
   links or lists. Confirm this plain-text output is the intent.
 * NOTE(review): test_reading_webpage fetches a live Wikipedia page and,
   unlike the Olostep test, has no skip marker. It will need network access
   wherever the suite runs.

 pyproject.toml                            |  4 ++--
 src/khoj/processor/tools/online_search.py | 18 ++++++++++++++++-
 tests/test_helpers.py                     | 24 +++++++++++++++++++----
 3 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 17003c6c..63e254c1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,7 @@ classifiers = [
     "Topic :: Text Processing :: Linguistic",
 ]
 dependencies = [
-    "bs4 >= 0.0.1",
+    "beautifulsoup4 ~= 4.12.3",
     "dateparser >= 1.1.1",
     "defusedxml == 0.7.1",
     "fastapi >= 0.104.1",
@@ -58,7 +58,6 @@ dependencies = [
     "langchain <= 0.2.0",
     "langchain-openai >= 0.0.5",
     "requests >= 2.26.0",
-    "bs4 >= 0.0.1",
     "anyio == 3.7.1",
     "pymupdf >= 1.23.5",
     "django == 4.2.10",
@@ -76,6 +75,7 @@ dependencies = [
     "openai-whisper >= 20231117",
     "django-phonenumber-field == 7.3.0",
     "phonenumbers == 8.13.27",
+    "markdownify ~= 0.11.6",
 ]
 dynamic = ["version"]
 
diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py
index 33589eac..f0436e2b 100644
--- a/src/khoj/processor/tools/online_search.py
+++ b/src/khoj/processor/tools/online_search.py
@@ -6,6 +6,8 @@ from typing import Dict, Union
 
 import aiohttp
 import requests
+from bs4 import BeautifulSoup
+from markdownify import markdownify
 
 from khoj.routers.helpers import extract_relevant_info, generate_online_subqueries
 from khoj.utils.helpers import is_none_or_empty, timer
@@ -101,7 +103,7 @@ async def search_with_google(query: str, conversation_history: dict, location: L
 async def read_webpage_and_extract_content(subquery, url):
     try:
         with timer(f"Reading web page at '{url}' took", logger):
-            content = await read_webpage_with_olostep(url)
+            content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage(url)
         with timer(f"Extracting relevant information from web page at '{url}' took", logger):
             extracted_info = await extract_relevant_info(subquery, {subquery: [content.strip()]}) if content else None
         return subquery, extracted_info
@@ -110,6 +112,20 @@ async def read_webpage_and_extract_content(subquery, url):
         return subquery, None
 
 
+async def read_webpage(web_url: str) -> str:
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
+    }
+
+    async with aiohttp.ClientSession() as session:
+        async with session.get(web_url, headers=headers, timeout=30) as response:
+            response.raise_for_status()
+            html = await response.text()
+            parsed_html = BeautifulSoup(html, "html.parser")
+            body = parsed_html.body.get_text(separator="\n", strip=True)
+            return markdownify(body)
+
+
 async def read_webpage_with_olostep(web_url: str) -> str:
     headers = {"Authorization": f"Bearer {OLOSTEP_API_KEY}"}
     web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy()  # type: ignore
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index e48259ad..086e4895 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -7,7 +7,7 @@ import pytest
 from scipy.stats import linregress
 
 from khoj.processor.embeddings import EmbeddingsModel
-from khoj.processor.tools.online_search import read_webpage_with_olostep
+from khoj.processor.tools.online_search import read_webpage, read_webpage_with_olostep
 from khoj.utils import helpers
 
 
@@ -84,13 +84,29 @@ def test_encode_docs_memory_leak():
     assert slope < 2, f"Memory leak suspected on {device}. Memory usage increased at ~{slope:.2f} MB per iteration"
 
 
-@pytest.mark.skipif(os.getenv("OLOSTEP_API_KEY") is None, reason="OLOSTEP_API_KEY is not set")
-def test_olostep_api():
+@pytest.mark.asyncio
+async def test_reading_webpage():
     # Arrange
     website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"
 
     # Act
-    response = read_webpage_with_olostep(website)
+    response = await read_webpage(website)
+
+    # Assert
+    assert (
+        "An alarm sent from the area near the fire also failed to register at the courthouse where the fire watchmen were"
+        in response
+    )
+
+
+@pytest.mark.skipif(os.getenv("OLOSTEP_API_KEY") is None, reason="OLOSTEP_API_KEY is not set")
+@pytest.mark.asyncio
+async def test_reading_webpage_with_olostep():
+    # Arrange
+    website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"
+
+    # Act
+    response = await read_webpage_with_olostep(website)
 
     # Assert
     assert (