mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 15:38:55 +01:00
Limit the number of urls the webscraper can extract for scraping
This commit is contained in:
parent
327fcb8f62
commit
a213b593e8
1 changed files with 5 additions and 0 deletions
|
@ -54,6 +54,7 @@ OLOSTEP_QUERY_PARAMS = {
|
|||
}
|
||||
|
||||
DEFAULT_MAX_WEBPAGES_TO_READ = 1
|
||||
MAX_WEBPAGES_TO_INFER = 10
|
||||
|
||||
|
||||
async def search_online(
|
||||
|
@ -157,6 +158,7 @@ async def read_webpages(
|
|||
query_images: List[str] = None,
|
||||
agent: Agent = None,
|
||||
tracer: dict = {},
|
||||
max_webpages_to_read: int = DEFAULT_MAX_WEBPAGES_TO_READ,
|
||||
):
|
||||
"Infer web pages to read from the query and extract relevant information from them"
|
||||
logger.info(f"Inferring web pages to read")
|
||||
|
@ -165,6 +167,9 @@ async def read_webpages(
|
|||
yield {ChatEvent.STATUS: event}
|
||||
urls = await infer_webpage_urls(query, conversation_history, location, user, query_images)
|
||||
|
||||
# Keep only the first max_webpages_to_read URLs (defaults to DEFAULT_MAX_WEBPAGES_TO_READ, not a fixed 10)
|
||||
urls = urls[:max_webpages_to_read]
|
||||
|
||||
logger.info(f"Reading web pages at: {urls}")
|
||||
if send_status_func:
|
||||
webpage_links_str = "\n- " + "\n- ".join(list(urls))
|
||||
|
|
Loading…
Reference in a new issue