Limit the number of URLs the web scraper can extract for scraping

This commit is contained in:
sabaimran 2024-11-01 16:48:36 -07:00
parent 327fcb8f62
commit a213b593e8

View file

@ -54,6 +54,7 @@ OLOSTEP_QUERY_PARAMS = {
} }
DEFAULT_MAX_WEBPAGES_TO_READ = 1 DEFAULT_MAX_WEBPAGES_TO_READ = 1
MAX_WEBPAGES_TO_INFER = 10
async def search_online( async def search_online(
@ -157,6 +158,7 @@ async def read_webpages(
query_images: List[str] = None, query_images: List[str] = None,
agent: Agent = None, agent: Agent = None,
tracer: dict = {}, tracer: dict = {},
max_webpages_to_read: int = DEFAULT_MAX_WEBPAGES_TO_READ,
): ):
"Infer web pages to read from the query and extract relevant information from them" "Infer web pages to read from the query and extract relevant information from them"
logger.info(f"Inferring web pages to read") logger.info(f"Inferring web pages to read")
@ -165,6 +167,9 @@ async def read_webpages(
yield {ChatEvent.STATUS: event} yield {ChatEvent.STATUS: event}
urls = await infer_webpage_urls(query, conversation_history, location, user, query_images) urls = await infer_webpage_urls(query, conversation_history, location, user, query_images)
# Read at most max_webpages_to_read of the inferred web pages
urls = urls[:max_webpages_to_read]
logger.info(f"Reading web pages at: {urls}") logger.info(f"Reading web pages at: {urls}")
if send_status_func: if send_status_func:
webpage_links_str = "\n- " + "\n- ".join(list(urls)) webpage_links_str = "\n- " + "\n- ".join(list(urls))