diff --git a/collector/index.js b/collector/index.js index 7c41002da..b307b58a4 100644 --- a/collector/index.js +++ b/collector/index.js @@ -83,9 +83,9 @@ app.post( "/util/get-link", [verifyPayloadIntegrity], async function (request, response) { - const { link } = reqBody(request); + const { link, captureAs = "text" } = reqBody(request); try { - const { success, content = null } = await getLinkText(link); + const { success, content = null } = await getLinkText(link, captureAs); response.status(200).json({ url: link, success, content }); } catch (e) { console.error(e); diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js index 4afb9b954..a22166d4c 100644 --- a/collector/processLink/convert/generic.js +++ b/collector/processLink/convert/generic.js @@ -6,9 +6,20 @@ const { writeToServerDocuments } = require("../../utils/files"); const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); -async function scrapeGenericUrl(link, textOnly = false) { - console.log(`-- Working URL ${link} --`); - const content = await getPageContent(link); +/** + * Scrape a generic URL and return the content in the specified format + * @param {string} link - The URL to scrape + * @param {('html' | 'text')} captureAs - The format to capture the page content as + * @param {boolean} processAsDocument - Whether to process the content as a document or return the content directly + * @returns {Promise<Object>} - The content of the page + */ +async function scrapeGenericUrl( + link, + captureAs = "text", + processAsDocument = true +) { + console.log(`-- Working URL ${link} => (${captureAs}) --`); + const content = await getPageContent(link, captureAs); if (!content.length) { console.error(`Resulting URL content was empty at ${link}.`); @@ -19,7 +30,7 @@ async function scrapeGenericUrl(link, textOnly = false) { }; } - if (textOnly) { + if (!processAsDocument) { return { success: true, content, @@ -52,7 +63,13 @@ async function scrapeGenericUrl(link, textOnly = false) { return { success: true, reason: null, documents: [document] }; } -async function getPageContent(link) { +/** + * Get the content of a page + * @param {string} link - The URL to get the content of + * @param {('html' | 'text')} captureAs - The format to capture the page content as + * @returns {Promise<string>} - The content of the page + */ +async function getPageContent(link, captureAs = "text") { try { let pageContents = []; const loader = new PuppeteerWebBaseLoader(link, { @@ -64,7 +81,11 @@ async function getPageContent(link) { waitUntil: "networkidle2", }, async evaluate(page, browser) { - const result = await page.evaluate(() => document.body.innerText); + const result = await page.evaluate((captureAs) => { + if (captureAs === "text") return document.body.innerText; + if (captureAs === "html") return document.documentElement.innerHTML; + return document.body.innerText; + }, captureAs); await browser.close(); return result; }, diff --git a/collector/processLink/index.js b/collector/processLink/index.js index afa517cae..ac0c5916b 100644 --- a/collector/processLink/index.js +++ b/collector/processLink/index.js @@ -6,9 +6,15 @@ async function processLink(link) { return await scrapeGenericUrl(link); } -async function getLinkText(link) { +/** + * Get the text content of a link + * @param {string} link - The link to get the text content of + * @param {('html' | 'text' | 'json')} captureAs - The format to capture the page content as + * @returns {Promise<{success: boolean, content: string}>} - Response from collector + */ +async function getLinkText(link, captureAs = "text") { if (!validURL(link)) return { success: false, reason: "Not a valid URL." }; - return await scrapeGenericUrl(link, true); + return await scrapeGenericUrl(link, captureAs, false); } module.exports = { diff --git a/frontend/src/pages/Admin/AgentBuilder/BlockList/index.jsx b/frontend/src/pages/Admin/AgentBuilder/BlockList/index.jsx index c937f109d..f3d674aee 100644 --- a/frontend/src/pages/Admin/AgentBuilder/BlockList/index.jsx +++ b/frontend/src/pages/Admin/AgentBuilder/BlockList/index.jsx @@ -126,6 +126,8 @@ const BLOCK_INFO = { description: "Scrape content from a webpage", defaultConfig: { url: "", + captureAs: "text", + querySelector: "", resultVariable: "", }, getSummary: (config) => config.url || "No URL specified", diff --git a/frontend/src/pages/Admin/AgentBuilder/nodes/WebScrapingNode/index.jsx b/frontend/src/pages/Admin/AgentBuilder/nodes/WebScrapingNode/index.jsx index fda51e34d..76655af74 100644 --- a/frontend/src/pages/Admin/AgentBuilder/nodes/WebScrapingNode/index.jsx +++ b/frontend/src/pages/Admin/AgentBuilder/nodes/WebScrapingNode/index.jsx @@ -25,6 +25,48 @@ export default function WebScrapingNode({ /> </div> + <div> + <label className="block text-sm font-medium text-theme-text-primary mb-2"> + Capture Page Content As + </label> + <select + value={config.captureAs} + onChange={(e) => onConfigChange({ captureAs: e.target.value })} + className="w-full border-none bg-theme-settings-input-bg text-theme-text-primary text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none p-2.5" + > + {[ + { label: "Text content only", value: "text" }, + { label: "Raw HTML", value: "html" }, + { label: "CSS Query Selector", value: "querySelector" }, + ].map((captureAs) => ( + <option + key={captureAs.value} + value={captureAs.value} + className="bg-theme-settings-input-bg" + > + {captureAs.label} + </option> + ))} + </select> + </div> + + {config.captureAs === "querySelector" && ( + <div> + <label className="block text-sm font-medium text-theme-text-primary mb-2"> + Query Selector + </label> + <p className="text-xs text-theme-text-secondary mb-2"> + Enter a valid CSS selector to scrape the content of the page. + </p> + <input + value={config.querySelector} + onChange={(e) => onConfigChange({ querySelector: e.target.value })} + placeholder=".article-content, #content, .main-content, etc." + className="w-full border-none bg-theme-settings-input-bg text-theme-text-primary text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none p-2.5" + /> + </div> + )} + <div> <label className="block text-sm font-medium text-theme-text-primary mb-2"> Result Variable diff --git a/server/package.json b/server/package.json index ddde1d7ec..497a0d00a 100644 --- a/server/package.json +++ b/server/package.json @@ -43,6 +43,7 @@ "body-parser": "^1.20.2", "chalk": "^4", "check-disk-space": "^3.4.0", + "cheerio": "^1.0.0", "chromadb": "^1.5.2", "cohere-ai": "^7.9.5", "cors": "^2.8.5", diff --git a/server/utils/agentFlows/executors/web-scraping.js b/server/utils/agentFlows/executors/web-scraping.js index c2c5ccce5..6ee65349c 100644 --- a/server/utils/agentFlows/executors/web-scraping.js +++ b/server/utils/agentFlows/executors/web-scraping.js @@ -10,15 +10,22 @@ const { summarizeContent } = require("../../agents/aibitat/utils/summarize"); * @returns {Promise<string>} Scraped content */ async function executeWebScraping(config, context) { - const { url } = config; + const { url, captureAs = "text" } = config; const { introspect, model, provider } = context; if (!url) { throw new Error("URL is required for web scraping"); } - introspect(`Scraping the content of ${url}`); - const { success, content } = await new CollectorApi().getLinkContent(url); + // Remap the captureAs to the correct mode for the CollectorApi + const captureMode = captureAs === "querySelector" ? "html" : captureAs; + introspect(`Scraping the content of ${url} as ${captureAs}`); + const { success, content } = await new CollectorApi() + .getLinkContent(url, captureMode) + .then((res) => { + if (captureAs !== "querySelector") return res; + return parseHTMLwithSelector(res.content, config.querySelector, context); + }); if (!success) { introspect(`Could not scrape ${url}. Cannot use this page's content.`); @@ -52,4 +59,38 @@ async function executeWebScraping(config, context) { return summary; } +/** + * Parse HTML with a CSS selector + * @param {string} html - The HTML to parse + * @param {string|null} selector - The CSS selector to use (as text string) + * @param {{introspect: Function}} context - The context object + * @returns {Object} The parsed content + */ +function parseHTMLwithSelector(html, selector = null, context) { + if (!selector || selector.length === 0) { + context.introspect("No selector provided. Returning the entire HTML."); + return { success: true, content: html }; + } + + const Cheerio = require("cheerio"); + const $ = Cheerio.load(html); + const selectedElements = $(selector); + + let content; + if (selectedElements.length === 0) { + return { success: false, content: null }; + } else if (selectedElements.length === 1) { + content = selectedElements.html(); + } else { + context.introspect( + `Found ${selectedElements.length} elements matching selector: ${selector}` + ); + content = selectedElements + .map((_, element) => $(element).html()) + .get() + .join("\n"); + } + return { success: true, content }; +} + module.exports = executeWebScraping; diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js index 7f5781918..22e2bcd9d 100644 --- a/server/utils/collectorApi/index.js +++ b/server/utils/collectorApi/index.js @@ -148,10 +148,10 @@ class CollectorApi { }); } - async getLinkContent(link = "") { + async getLinkContent(link = "", captureAs = "text") { if (!link) return false; - const data = JSON.stringify({ link }); + const data = JSON.stringify({ link, captureAs }); return await fetch(`${this.endpoint}/util/get-link`, { method: "POST", headers: { diff --git a/server/yarn.lock b/server/yarn.lock index 6cfabc83a..90e5e5749 100644 --- a/server/yarn.lock +++ b/server/yarn.lock @@ -2490,6 +2490,35 @@ check-disk-space@^3.4.0: resolved "https://registry.yarnpkg.com/check-disk-space/-/check-disk-space-3.4.0.tgz#eb8e69eee7a378fd12e35281b8123a8b4c4a8ff7" integrity sha512-drVkSqfwA+TvuEhFipiR1OC9boEGZL5RrWvVsOthdcvQNXyCCuKkEiTOTXZ7qxSf/GLwq4GvzfrQD/Wz325hgw== +cheerio-select@^2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/cheerio-select/-/cheerio-select-2.1.0.tgz#4d8673286b8126ca2a8e42740d5e3c4884ae21b4" + integrity sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g== + dependencies: + boolbase "^1.0.0" + css-select "^5.1.0" + css-what "^6.1.0" + domelementtype "^2.3.0" + domhandler "^5.0.3" + domutils "^3.0.1" + +cheerio@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/cheerio/-/cheerio-1.0.0.tgz#1ede4895a82f26e8af71009f961a9b8cb60d6a81" + integrity sha512-quS9HgjQpdaXOvsZz82Oz7uxtXiy6UIsIQcpBj7HRw2M63Skasm9qlDocAM7jNuaxdhpPU7c4kJN+gA5MCu4ww== + dependencies: + cheerio-select "^2.1.0" + dom-serializer "^2.0.0" + domhandler "^5.0.3" + domutils "^3.1.0" + encoding-sniffer "^0.2.0" + htmlparser2 "^9.1.0" + parse5 "^7.1.2" + parse5-htmlparser2-tree-adapter "^7.0.0" + parse5-parser-stream "^7.1.2" + undici "^6.19.5" + whatwg-mimetype "^4.0.0" + chokidar@^3.5.2: version "3.6.0" resolved "https://registry.yarnpkg.com/chokidar/-/chokidar-3.6.0.tgz#197c6cc669ef2a8dc5e7b4d97ee4e092c3eb0d5b" @@ -2963,6 +2992,15 @@ domutils@^3.0.1: domelementtype "^2.3.0" domhandler "^5.0.3" +domutils@^3.1.0: + version "3.2.2" + resolved "https://registry.yarnpkg.com/domutils/-/domutils-3.2.2.tgz#edbfe2b668b0c1d97c24baf0f1062b132221bc78" + integrity sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw== + dependencies: + dom-serializer "^2.0.0" + domelementtype "^2.3.0" + domhandler "^5.0.3" + dotenv@^16.0.3: version "16.4.5" resolved "https://registry.yarnpkg.com/dotenv/-/dotenv-16.4.5.tgz#cdd3b3b604cb327e286b4762e13502f717cb099f" @@ -3012,6 +3050,14 @@ encodeurl@~1.0.2: resolved "https://registry.yarnpkg.com/encodeurl/-/encodeurl-1.0.2.tgz#ad3ff4c86ec2d029322f5a02c3a9a606c95b3f59" integrity sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w== +encoding-sniffer@^0.2.0: + version "0.2.0" + resolved "https://registry.yarnpkg.com/encoding-sniffer/-/encoding-sniffer-0.2.0.tgz#799569d66d443babe82af18c9f403498365ef1d5" + integrity sha512-ju7Wq1kg04I3HtiYIOrUrdfdDvkyO9s5XM8QAj/bN61Yo/Vb4vgJxy5vi4Yxk01gWHbrofpPtpxM8bKger9jhg== + dependencies: + iconv-lite "^0.6.3" + whatwg-encoding "^3.1.1" + encoding@^0.1.13: version "0.1.13" resolved "https://registry.yarnpkg.com/encoding/-/encoding-0.1.13.tgz#56574afdd791f54a8e9b2785c0582a2d26210fa9" @@ -3026,7 +3072,7 @@ end-of-stream@^1.1.0, end-of-stream@^1.4.1: dependencies: once "^1.4.0" -entities@^4.2.0: +entities@^4.2.0, entities@^4.5.0: version "4.5.0" resolved "https://registry.yarnpkg.com/entities/-/entities-4.5.0.tgz#5d268ea5e7113ec74c4d033b79ea5a35a488fb48" integrity sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw== @@ -3882,6 +3928,16 @@ hermes-parser@0.20.1: dependencies: hermes-estree "0.20.1" +htmlparser2@^9.1.0: + version "9.1.0" + resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-9.1.0.tgz#cdb498d8a75a51f739b61d3f718136c369bc8c23" + integrity sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ== + dependencies: + domelementtype "^2.3.0" + domhandler "^5.0.3" + domutils "^3.1.0" + entities "^4.5.0" + http-errors@2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/http-errors/-/http-errors-2.0.0.tgz#b7774a1486ef73cf7667ac9ae0858c012c57b9d3" @@ -3943,7 +3999,7 @@ iconv-lite@0.4.24, iconv-lite@^0.4.24: dependencies: safer-buffer ">= 2.1.2 < 3" -iconv-lite@^0.6.2, iconv-lite@^0.6.3: +iconv-lite@0.6.3, iconv-lite@^0.6.2, iconv-lite@^0.6.3: version "0.6.3" resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.6.3.tgz#a52f80bf38da1952eb5c681790719871a1a72501" integrity sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw== @@ -5342,6 +5398,28 @@ parent-module@^1.0.0: dependencies: callsites "^3.0.0" +parse5-htmlparser2-tree-adapter@^7.0.0: + version "7.1.0" + resolved "https://registry.yarnpkg.com/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.1.0.tgz#b5a806548ed893a43e24ccb42fbb78069311e81b" + integrity sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g== + dependencies: + domhandler "^5.0.3" + parse5 "^7.0.0" + +parse5-parser-stream@^7.1.2: + version "7.1.2" + resolved "https://registry.yarnpkg.com/parse5-parser-stream/-/parse5-parser-stream-7.1.2.tgz#d7c20eadc37968d272e2c02660fff92dd27e60e1" + integrity sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow== + dependencies: + parse5 "^7.0.0" + +parse5@^7.0.0, parse5@^7.1.2: + version "7.2.1" + resolved "https://registry.yarnpkg.com/parse5/-/parse5-7.2.1.tgz#8928f55915e6125f430cc44309765bf17556a33a" + integrity sha512-BuBYQYlv1ckiPdQi/ohiivi9Sagc9JG+Ozs0r7b/0iK3sKmrb0b9FdWdBbOdx6hBCM/F9Ir82ofnBhtZOjCRPQ== + dependencies: + entities "^4.5.0" + parseurl@~1.3.3: version "1.3.3" resolved "https://registry.yarnpkg.com/parseurl/-/parseurl-1.3.3.tgz#9da19e7bee8d12dff0513ed5b76957793bc2e8d4" @@ -6482,6 +6560,11 @@ undici-types@~5.26.4: resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-5.26.5.tgz#bcd539893d00b56e964fd2657a4866b221a65617" integrity sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA== +undici@^6.19.5: + version "6.21.1" + resolved "https://registry.yarnpkg.com/undici/-/undici-6.21.1.tgz#336025a14162e6837e44ad7b819b35b6c6af0e05" + integrity sha512-q/1rj5D0/zayJB2FraXdaWxbhWiNKDvu8naDT2dl1yTlvJp4BLtOcp2a5BvgGNQpYYJzau7tf1WgKv3b+7mqpQ== + undici@~5.28.4: version "5.28.4" resolved "https://registry.yarnpkg.com/undici/-/undici-5.28.4.tgz#6b280408edb6a1a604a9b20340f45b422e373068" @@ -6580,11 +6663,23 @@ webidl-conversions@^3.0.0: resolved "https://registry.yarnpkg.com/webidl-conversions/-/webidl-conversions-3.0.1.tgz#24534275e2a7bc6be7bc86611cc16ae0a5654871" integrity sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ== +whatwg-encoding@^3.1.1: + version "3.1.1" + resolved "https://registry.yarnpkg.com/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz#d0f4ef769905d426e1688f3e34381a99b60b76e5" + integrity sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ== + dependencies: + iconv-lite "0.6.3" + whatwg-fetch@^3.4.1, whatwg-fetch@^3.6.20: version "3.6.20" resolved "https://registry.yarnpkg.com/whatwg-fetch/-/whatwg-fetch-3.6.20.tgz#580ce6d791facec91d37c72890995a0b48d31c70" integrity sha512-EqhiFU6daOA8kpjOWTL0olhVOF3i7OrFzSYiGsEMB8GcXS+RrzauAERX65xMeNWVqxA6HXH2m69Z9LaKKdisfg== +whatwg-mimetype@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz#bc1bf94a985dc50388d54a9258ac405c3ca2fc0a" + integrity sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg== + whatwg-url@^5.0.0: version "5.0.0" resolved "https://registry.yarnpkg.com/whatwg-url/-/whatwg-url-5.0.0.tgz#966454e8765462e37644d3626f6742ce8b70965d"