anything-llm/server/utils/agentFlows/executors/web-scraping.js
Timothy Carambat b6d3a411b1
Add querySelectorAll capability to web-scraping block ()
* Add `querySelectorAll` capability to web-scraping block

* patches and fallbacks

* fix styles of text in web scraping block

---------

Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
2025-02-13 16:11:15 -08:00

96 lines
3 KiB
JavaScript

const { CollectorApi } = require("../../collectorApi");
const { TokenManager } = require("../../helpers/tiktoken");
const Provider = require("../../agents/aibitat/providers/ai-provider");
const { summarizeContent } = require("../../agents/aibitat/utils/summarize");
/**
 * Execute a web scraping flow step.
 *
 * Fetches the page through the CollectorApi, optionally narrows the returned
 * HTML with a CSS selector, and returns the content unchanged when its token
 * count is below the model's context limit. Longer content is summarized.
 *
 * @param {Object} config Flow step configuration
 * @param {string} config.url URL to scrape (required)
 * @param {string} [config.captureAs="text"] Capture mode. "querySelector" is
 * fetched as "html" and then filtered with `config.querySelector`; any other
 * value is forwarded to the CollectorApi as-is.
 * @param {string} [config.querySelector] CSS selector applied when captureAs
 * is "querySelector"
 * @param {Object} context Execution context
 * @param {Function} context.introspect Receives progress messages
 * @param {*} context.model Forwarded to TokenManager, Provider.contextLimit
 * and summarizeContent
 * @param {*} context.provider Forwarded to Provider.contextLimit and
 * summarizeContent
 * @returns {Promise<string>} Scraped content, or its summary when too long
 * @throws {Error} When no URL is given, when the page could not be scraped
 * (or the selector step reported failure), or when the result is empty
 */
async function executeWebScraping(config, context) {
  const { url, captureAs = "text" } = config;
  const { introspect, model, provider } = context;

  if (!url) {
    throw new Error("URL is required for web scraping");
  }

  // Remap the captureAs to the correct mode for the CollectorApi
  const usesSelector = captureAs === "querySelector";
  const captureMode = usesSelector ? "html" : captureAs;

  introspect(`Scraping the content of ${url} as ${captureAs}`);
  let result = await new CollectorApi().getLinkContent(url, captureMode);

  // Only run the selector over a successful fetch. Parsing the content of a
  // failed response would either crash on missing HTML or mask the failure
  // behind a "successful" empty result.
  if (result?.success && usesSelector) {
    result = parseHTMLwithSelector(
      result.content,
      config.querySelector,
      context
    );
  }

  if (!result?.success) {
    introspect(`Could not scrape ${url}. Cannot use this page's content.`);
    throw new Error("URL could not be scraped and no content was found.");
  }

  const { content } = result;
  introspect(`Successfully scraped content from ${url}`);

  if (!content || content.length === 0) {
    throw new Error("There was no content to be collected or read.");
  }

  // Content that fits within the model's context limit is returned verbatim.
  const tokenCount = new TokenManager(model).countFromString(content);
  const contextLimit = Provider.contextLimit(provider, model);
  if (tokenCount < contextLimit) {
    return content;
  }

  introspect(
    `This page's content is way too long. I will summarize it right now.`
  );
  const summary = await summarizeContent({
    provider,
    model,
    content,
  });
  introspect(`Successfully summarized content`);
  return summary;
}
/**
 * Parse HTML with a CSS selector.
 *
 * - Missing HTML (null/undefined) yields a failure result instead of being
 *   handed to the parser or reported as a success.
 * - Without a selector the HTML is returned untouched.
 * - With a selector, the HTML content of the single match is returned, or
 *   the HTML content of every match joined by newlines.
 *
 * @param {string|null} html - The HTML to parse
 * @param {string|null} selector - The CSS selector to use (as text string)
 * @param {{introspect: Function}} context - The context object
 * @returns {{success: boolean, content: string|null}} `success` is false when
 * there is no HTML or nothing matched the selector
 */
function parseHTMLwithSelector(html, selector = null, context) {
  if (html == null) {
    context.introspect("No HTML content was available to parse.");
    return { success: false, content: null };
  }

  if (!selector || selector.length === 0) {
    context.introspect("No selector provided. Returning the entire HTML.");
    return { success: true, content: html };
  }

  // Loaded on demand so the dependency is only required when a selector is used.
  const Cheerio = require("cheerio");
  const $ = Cheerio.load(html);
  const selectedElements = $(selector);

  if (selectedElements.length === 0) {
    return { success: false, content: null };
  }

  if (selectedElements.length === 1) {
    return { success: true, content: selectedElements.html() };
  }

  context.introspect(
    `Found ${selectedElements.length} elements matching selector: ${selector}`
  );
  const content = selectedElements
    .map((_, element) => $(element).html())
    .get()
    .join("\n");
  return { success: true, content };
}
// The executor function itself is the module's export value.
module.exports = executeWebScraping;