mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 09:25:06 +01:00
Limit the number of URLs the web scraper can extract for scraping
This commit is contained in:
parent
327fcb8f62
commit
a213b593e8
1 changed file with 5 additions and 0 deletions
|
@ -54,6 +54,7 @@ OLOSTEP_QUERY_PARAMS = {
|
||||||
}
|
}
|
||||||
|
|
||||||
DEFAULT_MAX_WEBPAGES_TO_READ = 1
|
DEFAULT_MAX_WEBPAGES_TO_READ = 1
|
||||||
|
MAX_WEBPAGES_TO_INFER = 10
|
||||||
|
|
||||||
|
|
||||||
async def search_online(
|
async def search_online(
|
||||||
|
@ -157,6 +158,7 @@ async def read_webpages(
|
||||||
query_images: List[str] = None,
|
query_images: List[str] = None,
|
||||||
agent: Agent = None,
|
agent: Agent = None,
|
||||||
tracer: dict = {},
|
tracer: dict = {},
|
||||||
|
max_webpages_to_read: int = DEFAULT_MAX_WEBPAGES_TO_READ,
|
||||||
):
|
):
|
||||||
"Infer web pages to read from the query and extract relevant information from them"
|
"Infer web pages to read from the query and extract relevant information from them"
|
||||||
logger.info(f"Inferring web pages to read")
|
logger.info(f"Inferring web pages to read")
|
||||||
|
@ -165,6 +167,9 @@ async def read_webpages(
|
||||||
yield {ChatEvent.STATUS: event}
|
yield {ChatEvent.STATUS: event}
|
||||||
urls = await infer_webpage_urls(query, conversation_history, location, user, query_images)
|
urls = await infer_webpage_urls(query, conversation_history, location, user, query_images)
|
||||||
|
|
||||||
|
# Keep only the first max_webpages_to_read inferred web pages to read
|
||||||
|
urls = urls[:max_webpages_to_read]
|
||||||
|
|
||||||
logger.info(f"Reading web pages at: {urls}")
|
logger.info(f"Reading web pages at: {urls}")
|
||||||
if send_status_func:
|
if send_status_func:
|
||||||
webpage_links_str = "\n- " + "\n- ".join(list(urls))
|
webpage_links_str = "\n- " + "\n- ".join(list(urls))
|
||||||
|
|
Loading…
Reference in a new issue