anything-llm/server/utils/agentFlows/executors/web-scraping.js

const { CollectorApi } = require("../../collectorApi");
const { TokenManager } = require("../../helpers/tiktoken");
const Provider = require("../../agents/aibitat/providers/ai-provider");
const { summarizeContent } = require("../../agents/aibitat/utils/summarize");

/**
 * Execute a web scraping flow step.
 *
 * Fetches `config.url` through the CollectorApi, optionally narrows the
 * returned HTML with a CSS selector, and returns the resulting content.
 * Content whose token count is not below the provider/model context limit
 * is summarized before being returned.
 *
 * @param {Object} config Flow step configuration
 * @param {string} config.url URL to scrape (required)
 * @param {string} [config.captureAs="text"] Capture mode. "querySelector" is
 *   requested from the collector as "html" and then filtered with
 *   `config.querySelector`; any other value is forwarded unchanged.
 * @param {string} [config.querySelector] CSS selector applied when captureAs
 *   is "querySelector"
 * @param {Object} context Execution context
 * @param {Function} context.introspect Callback that receives progress messages
 * @param {*} context.model Model passed to TokenManager, Provider.contextLimit
 *   and summarizeContent
 * @param {*} context.provider Provider passed to Provider.contextLimit and
 *   summarizeContent
 * @returns {Promise<string>} Scraped content, or a summary of it when the
 *   content does not fit under the context limit
 * @throws {Error} When the URL is missing, the page could not be scraped (or
 *   the selector matched nothing), or the scraped content is empty
 */
async function executeWebScraping(config, context) {
  const { url, captureAs = "text" } = config;
  const { introspect, model, provider } = context;

  if (!url) {
    throw new Error("URL is required for web scraping");
  }

  const usesSelector = captureAs === "querySelector";
  // Remap the captureAs to the correct mode for the CollectorApi
  const captureMode = usesSelector ? "html" : captureAs;
  introspect(`Scraping the content of ${url} as ${captureAs}`);

  const scraped = await new CollectorApi().getLinkContent(url, captureMode);

  // Only run the selector against a successful fetch. Parsing a failed
  // response would hand null HTML to the parser and could report the failed
  // scrape as a success.
  const result =
    usesSelector && scraped?.success
      ? parseHTMLwithSelector(scraped.content, config.querySelector, context)
      : scraped;

  if (!result?.success) {
    introspect(`Could not scrape ${url}. Cannot use this page's content.`);
    throw new Error("URL could not be scraped and no content was found.");
  }

  introspect(`Successfully scraped content from ${url}`);

  const { content } = result;
  if (!content || content?.length === 0) {
    throw new Error("There was no content to be collected or read.");
  }

  const tokenCount = new TokenManager(model).countFromString(content);
  const contextLimit = Provider.contextLimit(provider, model);

  // Content that fits under the context limit is returned untouched.
  if (tokenCount < contextLimit) {
    return content;
  }

  introspect(
    `This page's content is way too long. I will summarize it right now.`
  );
  const summary = await summarizeContent({
    provider,
    model,
    content,
  });

  introspect(`Successfully summarized content`);

  return summary;
}

/**
 * Extract the inner HTML of every element matching a CSS selector.
 *
 * Without a selector the HTML is passed through unchanged. With a selector,
 * the HTML is loaded into cheerio and the inner HTML of each match is
 * returned, joined by newlines when more than one element matches.
 *
 * @param {string} html - The HTML to search
 * @param {string|null} selector - CSS selector text; null/empty means "no selector"
 * @param {{introspect: Function}} context - Object whose introspect callback receives progress messages
 * @returns {{success: boolean, content: (string|null)}} `success` is false
 *   (with null content) only when a selector was given and matched nothing
 */
function parseHTMLwithSelector(html, selector = null, context) {
  const hasSelector = Boolean(selector) && selector.length !== 0;
  if (!hasSelector) {
    context.introspect("No selector provided. Returning the entire HTML.");
    return { success: true, content: html };
  }

  // Loaded lazily so cheerio is only required when a selector is in use.
  const cheerio = require("cheerio");
  const $ = cheerio.load(html);
  const matches = $(selector);
  const matchCount = matches.length;

  if (matchCount === 0) {
    return { success: false, content: null };
  }

  if (matchCount === 1) {
    return { success: true, content: matches.html() };
  }

  context.introspect(
    `Found ${matchCount} elements matching selector: ${selector}`
  );
  const fragments = matches.map((_, node) => $(node).html()).get();
  return { success: true, content: fragments.join("\n") };
}

// The executor function itself is the module's export (not wrapped in an object).
module.exports = executeWebScraping;
Agent flow builder (#3077) * wip agent builder * refactor structure for agent builder * improve ui for add block menu and sidebar * lint * node ui improvement * handle deleting variable in all nodes * add headers and body to apiCall node * lint * Agent flow builder backend (#3078) * wip agent builder backend * save/load agent tasks * lint * refactor agent task to use uuids instead of names * placeholder for run task * update frontend sidebar + seperate backend to agent-tasks utils * lint * add deleting of agent tasks * create AgentTasks class + wip load agent tasks into aibitat * lint * inject + call agent tasks * wip call agent tasks * add llm instruction + fix api calling blocks * add ui + backend for editing/toggling agent tasks * lint * add back middlewares * disable run task + add navigate to home on logo click * implement normalizePath to prevent path traversal * wip make api calling more consistent * lint * rename all references from task to flow * patch load flow bug when on editing page * remove unneeded files/comments * lint * fix delete endpoint + rename load flows * add move block to ui + fix api-call backend + add telemetry * lint * add web scraping block * only allow admin for agent builder --------- Co-authored-by: timothycarambat <rambat1010@gmail.com> * Move AgentFlowManager flows to static simplify UI states Handle LLM prompt flow when provided non-string * delete/edit menu for agent flow panel + update flow icon * lint * fix open builder button hidden bug * add tooltips to move up/down block buttons * add tooltip to delete block * truncate block description to fit on blocklist component * light mode agent builder sidebar * light mode api call block * fix light mode styles for agent builder blocks * agent flow fetch in UI * sync delete flow * agent flow ui/ux improvements * remove unused AgentSidebar component * comment out /run * UI changes and updates for flow builder * format flow panel info * update link handling * ui tweaks to header menu * 
remove unused import * update doc links update block icons * bump readme * Patch code block header oddity resolves #3117 * bump dev image --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com> 2025-02-13 08:50:43 +08:00			`const { CollectorApi } = require("../../collectorApi");`
			`const { TokenManager } = require("../../helpers/tiktoken");`
			`const Provider = require("../../agents/aibitat/providers/ai-provider");`
			`const { summarizeContent } = require("../../agents/aibitat/utils/summarize");`

			`/**`
			`* Execute a web scraping flow step`
			`* @param {Object} config Flow step configuration`
			`* @param {Object} context Execution context with introspect function`
			`* @returns {Promise<string>} Scraped content`
			`*/`
			`async function executeWebScraping(config, context) {`
Add `querySelectorAll` capability to web-scraping block (#3186) * Add `querySelectorAll` capability to web-scraping block * patches and fallbacks * fix styles of text in web scraping block --------- Co-authored-by: shatfield4 <seanhatfield5@gmail.com> 2025-02-13 16:11:15 -08:00			`const { url, captureAs = "text" } = config;`
Agent flow builder (#3077) * wip agent builder * refactor structure for agent builder * improve ui for add block menu and sidebar * lint * node ui improvement * handle deleting variable in all nodes * add headers and body to apiCall node * lint * Agent flow builder backend (#3078) * wip agent builder backend * save/load agent tasks * lint * refactor agent task to use uuids instead of names * placeholder for run task * update frontend sidebar + seperate backend to agent-tasks utils * lint * add deleting of agent tasks * create AgentTasks class + wip load agent tasks into aibitat * lint * inject + call agent tasks * wip call agent tasks * add llm instruction + fix api calling blocks * add ui + backend for editing/toggling agent tasks * lint * add back middlewares * disable run task + add navigate to home on logo click * implement normalizePath to prevent path traversal * wip make api calling more consistent * lint * rename all references from task to flow * patch load flow bug when on editing page * remove unneeded files/comments * lint * fix delete endpoint + rename load flows * add move block to ui + fix api-call backend + add telemetry * lint * add web scraping block * only allow admin for agent builder --------- Co-authored-by: timothycarambat <rambat1010@gmail.com> * Move AgentFlowManager flows to static simplify UI states Handle LLM prompt flow when provided non-string * delete/edit menu for agent flow panel + update flow icon * lint * fix open builder button hidden bug * add tooltips to move up/down block buttons * add tooltip to delete block * truncate block description to fit on blocklist component * light mode agent builder sidebar * light mode api call block * fix light mode styles for agent builder blocks * agent flow fetch in UI * sync delete flow * agent flow ui/ux improvements * remove unused AgentSidebar component * comment out /run * UI changes and updates for flow builder * format flow panel info * update link handling * ui tweaks to header menu * 
remove unused import * update doc links update block icons * bump readme * Patch code block header oddity resolves #3117 * bump dev image --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com> 2025-02-13 08:50:43 +08:00			`const { introspect, model, provider } = context;`

			`if (!url) {`
			`throw new Error("URL is required for web scraping");`
			`}`

Add `querySelectorAll` capability to web-scraping block (#3186) * Add `querySelectorAll` capability to web-scraping block * patches and fallbacks * fix styles of text in web scraping block --------- Co-authored-by: shatfield4 <seanhatfield5@gmail.com> 2025-02-13 16:11:15 -08:00			`// Remap the captureAs to the correct mode for the CollectorApi`
			`const captureMode = captureAs === "querySelector" ? "html" : captureAs;`
			introspect(`Scraping the content of ${url} as ${captureAs}`);
			`const { success, content } = await new CollectorApi()`
			`.getLinkContent(url, captureMode)`
			`.then((res) => {`
			`if (captureAs !== "querySelector") return res;`
			`return parseHTMLwithSelector(res.content, config.querySelector, context);`
			`});`
Agent flow builder (#3077) * wip agent builder * refactor structure for agent builder * improve ui for add block menu and sidebar * lint * node ui improvement * handle deleting variable in all nodes * add headers and body to apiCall node * lint * Agent flow builder backend (#3078) * wip agent builder backend * save/load agent tasks * lint * refactor agent task to use uuids instead of names * placeholder for run task * update frontend sidebar + seperate backend to agent-tasks utils * lint * add deleting of agent tasks * create AgentTasks class + wip load agent tasks into aibitat * lint * inject + call agent tasks * wip call agent tasks * add llm instruction + fix api calling blocks * add ui + backend for editing/toggling agent tasks * lint * add back middlewares * disable run task + add navigate to home on logo click * implement normalizePath to prevent path traversal * wip make api calling more consistent * lint * rename all references from task to flow * patch load flow bug when on editing page * remove unneeded files/comments * lint * fix delete endpoint + rename load flows * add move block to ui + fix api-call backend + add telemetry * lint * add web scraping block * only allow admin for agent builder --------- Co-authored-by: timothycarambat <rambat1010@gmail.com> * Move AgentFlowManager flows to static simplify UI states Handle LLM prompt flow when provided non-string * delete/edit menu for agent flow panel + update flow icon * lint * fix open builder button hidden bug * add tooltips to move up/down block buttons * add tooltip to delete block * truncate block description to fit on blocklist component * light mode agent builder sidebar * light mode api call block * fix light mode styles for agent builder blocks * agent flow fetch in UI * sync delete flow * agent flow ui/ux improvements * remove unused AgentSidebar component * comment out /run * UI changes and updates for flow builder * format flow panel info * update link handling * ui tweaks to header menu * 
remove unused import * update doc links update block icons * bump readme * Patch code block header oddity resolves #3117 * bump dev image --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com> 2025-02-13 08:50:43 +08:00
			`if (!success) {`
			introspect(`Could not scrape ${url}. Cannot use this page's content.`);
			`throw new Error("URL could not be scraped and no content was found.");`
			`}`

			introspect(`Successfully scraped content from ${url}`);

			`if (!content \|\| content?.length === 0) {`
			`throw new Error("There was no content to be collected or read.");`
			`}`

			`const tokenCount = new TokenManager(model).countFromString(content);`
			`const contextLimit = Provider.contextLimit(provider, model);`

			`if (tokenCount < contextLimit) {`
			`return content;`
			`}`

			`introspect(`
			`This page's content is way too long. I will summarize it right now.`
			`);`
			`const summary = await summarizeContent({`
			`provider,`
			`model,`
			`content,`
			`});`

			introspect(`Successfully summarized content`);

			`return summary;`
			`}`

Add `querySelectorAll` capability to web-scraping block (#3186) * Add `querySelectorAll` capability to web-scraping block * patches and fallbacks * fix styles of text in web scraping block --------- Co-authored-by: shatfield4 <seanhatfield5@gmail.com> 2025-02-13 16:11:15 -08:00			`/**`
			`* Parse HTML with a CSS selector`
			`* @param {string} html - The HTML to parse`
			`* @param {string\|null} selector - The CSS selector to use (as text string)`
			`* @param {{introspect: Function}} context - The context object`
			`* @returns {Object} The parsed content`
			`*/`
			`function parseHTMLwithSelector(html, selector = null, context) {`
			`if (!selector \|\| selector.length === 0) {`
			`context.introspect("No selector provided. Returning the entire HTML.");`
			`return { success: true, content: html };`
			`}`

			`const Cheerio = require("cheerio");`
			`const $ = Cheerio.load(html);`
			`const selectedElements = $(selector);`

			`let content;`
			`if (selectedElements.length === 0) {`
			`return { success: false, content: null };`
			`} else if (selectedElements.length === 1) {`
			`content = selectedElements.html();`
			`} else {`
			`context.introspect(`
			`Found ${selectedElements.length} elements matching selector: ${selector}`
			`);`
			`content = selectedElements`
			`.map((_, element) => $(element).html())`
			`.get()`
			`.join("\n");`
			`}`
			`return { success: true, content };`
			`}`

Agent flow builder (#3077) * wip agent builder * refactor structure for agent builder * improve ui for add block menu and sidebar * lint * node ui improvement * handle deleting variable in all nodes * add headers and body to apiCall node * lint * Agent flow builder backend (#3078) * wip agent builder backend * save/load agent tasks * lint * refactor agent task to use uuids instead of names * placeholder for run task * update frontend sidebar + seperate backend to agent-tasks utils * lint * add deleting of agent tasks * create AgentTasks class + wip load agent tasks into aibitat * lint * inject + call agent tasks * wip call agent tasks * add llm instruction + fix api calling blocks * add ui + backend for editing/toggling agent tasks * lint * add back middlewares * disable run task + add navigate to home on logo click * implement normalizePath to prevent path traversal * wip make api calling more consistent * lint * rename all references from task to flow * patch load flow bug when on editing page * remove unneeded files/comments * lint * fix delete endpoint + rename load flows * add move block to ui + fix api-call backend + add telemetry * lint * add web scraping block * only allow admin for agent builder --------- Co-authored-by: timothycarambat <rambat1010@gmail.com> * Move AgentFlowManager flows to static simplify UI states Handle LLM prompt flow when provided non-string * delete/edit menu for agent flow panel + update flow icon * lint * fix open builder button hidden bug * add tooltips to move up/down block buttons * add tooltip to delete block * truncate block description to fit on blocklist component * light mode agent builder sidebar * light mode api call block * fix light mode styles for agent builder blocks * agent flow fetch in UI * sync delete flow * agent flow ui/ux improvements * remove unused AgentSidebar component * comment out /run * UI changes and updates for flow builder * format flow panel info * update link handling * ui tweaks to header menu * 
remove unused import * update doc links update block icons * bump readme * Patch code block header oddity resolves #3117 * bump dev image --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com> 2025-02-13 08:50:43 +08:00			`module.exports = executeWebScraping;`
No results found.