From b6d3a411b1ccb5bbcecfa4a8856b592ccf9cc2b2 Mon Sep 17 00:00:00 2001
From: Timothy Carambat <rambat1010@gmail.com>
Date: Thu, 13 Feb 2025 16:11:15 -0800
Subject: [PATCH] Add `querySelectorAll` capability to web-scraping block
 (#3186)

* Add `querySelectorAll` capability to web-scraping block

* patches and fallbacks

* fix styles of text in web scraping block

---------

Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
---
 collector/index.js                            |  4 +-
 collector/processLink/convert/generic.js      | 33 +++++--
 collector/processLink/index.js                | 10 +-
 .../Admin/AgentBuilder/BlockList/index.jsx    |  2 +
 .../nodes/WebScrapingNode/index.jsx           | 42 ++++++++
 server/package.json                           |  1 +
 .../agentFlows/executors/web-scraping.js      | 47 ++++++++-
 server/utils/collectorApi/index.js            |  4 +-
 server/yarn.lock                              | 99 ++++++++++++++++++-
 9 files changed, 225 insertions(+), 17 deletions(-)

diff --git a/collector/index.js b/collector/index.js
index 7c41002da..b307b58a4 100644
--- a/collector/index.js
+++ b/collector/index.js
@@ -83,9 +83,9 @@ app.post(
   "/util/get-link",
   [verifyPayloadIntegrity],
   async function (request, response) {
-    const { link } = reqBody(request);
+    const { link, captureAs = "text" } = reqBody(request);
     try {
-      const { success, content = null } = await getLinkText(link);
+      const { success, content = null } = await getLinkText(link, captureAs);
       response.status(200).json({ url: link, success, content });
     } catch (e) {
       console.error(e);
diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js
index 4afb9b954..a22166d4c 100644
--- a/collector/processLink/convert/generic.js
+++ b/collector/processLink/convert/generic.js
@@ -6,9 +6,20 @@ const { writeToServerDocuments } = require("../../utils/files");
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");
 
-async function scrapeGenericUrl(link, textOnly = false) {
-  console.log(`-- Working URL ${link} --`);
-  const content = await getPageContent(link);
+/**
+ * Scrape a generic URL and return the content in the specified format
+ * @param {string} link - The URL to scrape
+ * @param {('html' | 'text')} captureAs - The format to capture the page content as
+ * @param {boolean} processAsDocument - Whether to process the content as a document or return the content directly
+ * @returns {Promise<Object>} - On success `{ success, content }` when `processAsDocument` is false, otherwise `{ success, reason, documents }`
+ */
+async function scrapeGenericUrl(
+  link,
+  captureAs = "text",
+  processAsDocument = true
+) {
+  console.log(`-- Working URL ${link} => (${captureAs}) --`);
+  const content = await getPageContent(link, captureAs);
 
   if (!content.length) {
     console.error(`Resulting URL content was empty at ${link}.`);
@@ -19,7 +30,7 @@ async function scrapeGenericUrl(link, textOnly = false) {
     };
   }
 
-  if (textOnly) {
+  if (!processAsDocument) {
     return {
       success: true,
       content,
@@ -52,7 +63,13 @@ async function scrapeGenericUrl(link, textOnly = false) {
   return { success: true, reason: null, documents: [document] };
 }
 
-async function getPageContent(link) {
+/**
+ * Get the content of a page
+ * @param {string} link - The URL to get the content of
+ * @param {('html' | 'text')} captureAs - The format to capture the page content as (any other value falls back to text)
+ * @returns {Promise<string>} - The content of the page
+ */
+async function getPageContent(link, captureAs = "text") {
   try {
     let pageContents = [];
     const loader = new PuppeteerWebBaseLoader(link, {
@@ -64,7 +81,11 @@ async function getPageContent(link) {
         waitUntil: "networkidle2",
       },
       async evaluate(page, browser) {
-        const result = await page.evaluate(() => document.body.innerText);
+        const result = await page.evaluate((captureAs) => {
+          if (captureAs === "text") return document.body.innerText;
+          if (captureAs === "html") return document.documentElement.innerHTML;
+          return document.body.innerText;
+        }, captureAs);
         await browser.close();
         return result;
       },
diff --git a/collector/processLink/index.js b/collector/processLink/index.js
index afa517cae..ac0c5916b 100644
--- a/collector/processLink/index.js
+++ b/collector/processLink/index.js
@@ -6,9 +6,15 @@ async function processLink(link) {
   return await scrapeGenericUrl(link);
 }
 
-async function getLinkText(link) {
+/**
+ * Get the content of a link in the requested capture format
+ * @param {string} link - The link to get the content of
+ * @param {('html' | 'text')} captureAs - The format to capture the page content as
+ * @returns {Promise<{success: boolean, content: string}>} - Response from collector
+ */
+async function getLinkText(link, captureAs = "text") {
   if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
-  return await scrapeGenericUrl(link, true);
+  return await scrapeGenericUrl(link, captureAs, false);
 }
 
 module.exports = {
diff --git a/frontend/src/pages/Admin/AgentBuilder/BlockList/index.jsx b/frontend/src/pages/Admin/AgentBuilder/BlockList/index.jsx
index c937f109d..f3d674aee 100644
--- a/frontend/src/pages/Admin/AgentBuilder/BlockList/index.jsx
+++ b/frontend/src/pages/Admin/AgentBuilder/BlockList/index.jsx
@@ -126,6 +126,8 @@ const BLOCK_INFO = {
     description: "Scrape content from a webpage",
     defaultConfig: {
       url: "",
+      captureAs: "text",
+      querySelector: "",
       resultVariable: "",
     },
     getSummary: (config) => config.url || "No URL specified",
diff --git a/frontend/src/pages/Admin/AgentBuilder/nodes/WebScrapingNode/index.jsx b/frontend/src/pages/Admin/AgentBuilder/nodes/WebScrapingNode/index.jsx
index fda51e34d..76655af74 100644
--- a/frontend/src/pages/Admin/AgentBuilder/nodes/WebScrapingNode/index.jsx
+++ b/frontend/src/pages/Admin/AgentBuilder/nodes/WebScrapingNode/index.jsx
@@ -25,6 +25,48 @@ export default function WebScrapingNode({
         />
       </div>
 
+      <div>
+        <label className="block text-sm font-medium text-theme-text-primary mb-2">
+          Capture Page Content As
+        </label>
+        <select
+          value={config.captureAs}
+          onChange={(e) => onConfigChange({ captureAs: e.target.value })}
+          className="w-full border-none bg-theme-settings-input-bg text-theme-text-primary text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none p-2.5"
+        >
+          {[
+            { label: "Text content only", value: "text" },
+            { label: "Raw HTML", value: "html" },
+            { label: "CSS Query Selector", value: "querySelector" },
+          ].map((captureAs) => (
+            <option
+              key={captureAs.value}
+              value={captureAs.value}
+              className="bg-theme-settings-input-bg"
+            >
+              {captureAs.label}
+            </option>
+          ))}
+        </select>
+      </div>
+
+      {config.captureAs === "querySelector" && (
+        <div>
+          <label className="block text-sm font-medium text-theme-text-primary mb-2">
+            Query Selector
+          </label>
+          <p className="text-xs text-theme-text-secondary mb-2">
+            Enter a valid CSS selector to scrape the content of the page.
+          </p>
+          <input
+            value={config.querySelector}
+            onChange={(e) => onConfigChange({ querySelector: e.target.value })}
+            placeholder=".article-content, #content, .main-content, etc."
+            className="w-full border-none bg-theme-settings-input-bg text-theme-text-primary text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none p-2.5"
+          />
+        </div>
+      )}
+
       <div>
         <label className="block text-sm font-medium text-theme-text-primary mb-2">
           Result Variable
diff --git a/server/package.json b/server/package.json
index ddde1d7ec..497a0d00a 100644
--- a/server/package.json
+++ b/server/package.json
@@ -43,6 +43,7 @@
     "body-parser": "^1.20.2",
     "chalk": "^4",
     "check-disk-space": "^3.4.0",
+    "cheerio": "^1.0.0",
     "chromadb": "^1.5.2",
     "cohere-ai": "^7.9.5",
     "cors": "^2.8.5",
diff --git a/server/utils/agentFlows/executors/web-scraping.js b/server/utils/agentFlows/executors/web-scraping.js
index c2c5ccce5..6ee65349c 100644
--- a/server/utils/agentFlows/executors/web-scraping.js
+++ b/server/utils/agentFlows/executors/web-scraping.js
@@ -10,15 +10,22 @@ const { summarizeContent } = require("../../agents/aibitat/utils/summarize");
  * @returns {Promise<string>} Scraped content
  */
 async function executeWebScraping(config, context) {
-  const { url } = config;
+  const { url, captureAs = "text" } = config;
   const { introspect, model, provider } = context;
 
   if (!url) {
     throw new Error("URL is required for web scraping");
   }
 
-  introspect(`Scraping the content of ${url}`);
-  const { success, content } = await new CollectorApi().getLinkContent(url);
+  // The collector is only asked for "text" or "html"; a querySelector capture fetches HTML and is filtered below
+  const captureMode = captureAs === "querySelector" ? "html" : captureAs;
+  introspect(`Scraping the content of ${url} as ${captureAs}`);
+  const { success, content } = await new CollectorApi()
+    .getLinkContent(url, captureMode)
+    .then((res) => {
+      if (captureAs !== "querySelector") return res;
+      return parseHTMLwithSelector(res.content, config.querySelector, context);
+    });
 
   if (!success) {
     introspect(`Could not scrape ${url}. Cannot use this page's content.`);
@@ -52,4 +59,38 @@ async function executeWebScraping(config, context) {
   return summary;
 }
 
+/**
+ * Parse HTML with a CSS selector and return the inner HTML of every match.
+ * A non-string `html` (failed scrape) is reported as a failure, never a success.
+ * @param {string} html - The HTML to parse
+ * @param {string|null} selector - The CSS selector to use (as text string)
+ * @param {{introspect: Function}} context - The context object
+ * @returns {{success: boolean, content: string|null}} The parsed content
+ */
+function parseHTMLwithSelector(html, selector = null, context) {
+  // A failed scrape yields no HTML; it must not reach Cheerio or read as success.
+  if (typeof html !== "string") return { success: false, content: null };
+  if (!selector || selector.length === 0) {
+    context.introspect("No selector provided. Returning the entire HTML.");
+    return { success: true, content: html };
+  }
+
+  const Cheerio = require("cheerio");
+  const $ = Cheerio.load(html);
+  const selectedElements = $(selector);
+  if (selectedElements.length === 0) return { success: false, content: null };
+  if (selectedElements.length === 1)
+    return { success: true, content: selectedElements.html() };
+
+  // Several matches: report the count and join each element's inner HTML.
+  context.introspect(
+    `Found ${selectedElements.length} elements matching selector: ${selector}`
+  );
+  const content = selectedElements
+    .map((_, element) => $(element).html())
+    .get()
+    .join("\n");
+  return { success: true, content };
+}
+
 module.exports = executeWebScraping;
diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js
index 7f5781918..22e2bcd9d 100644
--- a/server/utils/collectorApi/index.js
+++ b/server/utils/collectorApi/index.js
@@ -148,10 +148,10 @@ class CollectorApi {
       });
   }
 
-  async getLinkContent(link = "") {
+  async getLinkContent(link = "", captureAs = "text") {
     if (!link) return false;
 
-    const data = JSON.stringify({ link });
+    const data = JSON.stringify({ link, captureAs });
     return await fetch(`${this.endpoint}/util/get-link`, {
       method: "POST",
       headers: {
diff --git a/server/yarn.lock b/server/yarn.lock
index 6cfabc83a..90e5e5749 100644
--- a/server/yarn.lock
+++ b/server/yarn.lock
@@ -2490,6 +2490,35 @@ check-disk-space@^3.4.0:
   resolved "https://registry.yarnpkg.com/check-disk-space/-/check-disk-space-3.4.0.tgz#eb8e69eee7a378fd12e35281b8123a8b4c4a8ff7"
   integrity sha512-drVkSqfwA+TvuEhFipiR1OC9boEGZL5RrWvVsOthdcvQNXyCCuKkEiTOTXZ7qxSf/GLwq4GvzfrQD/Wz325hgw==
 
+cheerio-select@^2.1.0:
+  version "2.1.0"
+  resolved "https://registry.yarnpkg.com/cheerio-select/-/cheerio-select-2.1.0.tgz#4d8673286b8126ca2a8e42740d5e3c4884ae21b4"
+  integrity sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==
+  dependencies:
+    boolbase "^1.0.0"
+    css-select "^5.1.0"
+    css-what "^6.1.0"
+    domelementtype "^2.3.0"
+    domhandler "^5.0.3"
+    domutils "^3.0.1"
+
+cheerio@^1.0.0:
+  version "1.0.0"
+  resolved "https://registry.yarnpkg.com/cheerio/-/cheerio-1.0.0.tgz#1ede4895a82f26e8af71009f961a9b8cb60d6a81"
+  integrity sha512-quS9HgjQpdaXOvsZz82Oz7uxtXiy6UIsIQcpBj7HRw2M63Skasm9qlDocAM7jNuaxdhpPU7c4kJN+gA5MCu4ww==
+  dependencies:
+    cheerio-select "^2.1.0"
+    dom-serializer "^2.0.0"
+    domhandler "^5.0.3"
+    domutils "^3.1.0"
+    encoding-sniffer "^0.2.0"
+    htmlparser2 "^9.1.0"
+    parse5 "^7.1.2"
+    parse5-htmlparser2-tree-adapter "^7.0.0"
+    parse5-parser-stream "^7.1.2"
+    undici "^6.19.5"
+    whatwg-mimetype "^4.0.0"
+
 chokidar@^3.5.2:
   version "3.6.0"
   resolved "https://registry.yarnpkg.com/chokidar/-/chokidar-3.6.0.tgz#197c6cc669ef2a8dc5e7b4d97ee4e092c3eb0d5b"
@@ -2963,6 +2992,15 @@ domutils@^3.0.1:
     domelementtype "^2.3.0"
     domhandler "^5.0.3"
 
+domutils@^3.1.0:
+  version "3.2.2"
+  resolved "https://registry.yarnpkg.com/domutils/-/domutils-3.2.2.tgz#edbfe2b668b0c1d97c24baf0f1062b132221bc78"
+  integrity sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==
+  dependencies:
+    dom-serializer "^2.0.0"
+    domelementtype "^2.3.0"
+    domhandler "^5.0.3"
+
 dotenv@^16.0.3:
   version "16.4.5"
   resolved "https://registry.yarnpkg.com/dotenv/-/dotenv-16.4.5.tgz#cdd3b3b604cb327e286b4762e13502f717cb099f"
@@ -3012,6 +3050,14 @@ encodeurl@~1.0.2:
   resolved "https://registry.yarnpkg.com/encodeurl/-/encodeurl-1.0.2.tgz#ad3ff4c86ec2d029322f5a02c3a9a606c95b3f59"
   integrity sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==
 
+encoding-sniffer@^0.2.0:
+  version "0.2.0"
+  resolved "https://registry.yarnpkg.com/encoding-sniffer/-/encoding-sniffer-0.2.0.tgz#799569d66d443babe82af18c9f403498365ef1d5"
+  integrity sha512-ju7Wq1kg04I3HtiYIOrUrdfdDvkyO9s5XM8QAj/bN61Yo/Vb4vgJxy5vi4Yxk01gWHbrofpPtpxM8bKger9jhg==
+  dependencies:
+    iconv-lite "^0.6.3"
+    whatwg-encoding "^3.1.1"
+
 encoding@^0.1.13:
   version "0.1.13"
   resolved "https://registry.yarnpkg.com/encoding/-/encoding-0.1.13.tgz#56574afdd791f54a8e9b2785c0582a2d26210fa9"
@@ -3026,7 +3072,7 @@ end-of-stream@^1.1.0, end-of-stream@^1.4.1:
   dependencies:
     once "^1.4.0"
 
-entities@^4.2.0:
+entities@^4.2.0, entities@^4.5.0:
   version "4.5.0"
   resolved "https://registry.yarnpkg.com/entities/-/entities-4.5.0.tgz#5d268ea5e7113ec74c4d033b79ea5a35a488fb48"
   integrity sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==
@@ -3882,6 +3928,16 @@ hermes-parser@0.20.1:
   dependencies:
     hermes-estree "0.20.1"
 
+htmlparser2@^9.1.0:
+  version "9.1.0"
+  resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-9.1.0.tgz#cdb498d8a75a51f739b61d3f718136c369bc8c23"
+  integrity sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==
+  dependencies:
+    domelementtype "^2.3.0"
+    domhandler "^5.0.3"
+    domutils "^3.1.0"
+    entities "^4.5.0"
+
 http-errors@2.0.0:
   version "2.0.0"
   resolved "https://registry.yarnpkg.com/http-errors/-/http-errors-2.0.0.tgz#b7774a1486ef73cf7667ac9ae0858c012c57b9d3"
@@ -3943,7 +3999,7 @@ iconv-lite@0.4.24, iconv-lite@^0.4.24:
   dependencies:
     safer-buffer ">= 2.1.2 < 3"
 
-iconv-lite@^0.6.2, iconv-lite@^0.6.3:
+iconv-lite@0.6.3, iconv-lite@^0.6.2, iconv-lite@^0.6.3:
   version "0.6.3"
   resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.6.3.tgz#a52f80bf38da1952eb5c681790719871a1a72501"
   integrity sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==
@@ -5342,6 +5398,28 @@ parent-module@^1.0.0:
   dependencies:
     callsites "^3.0.0"
 
+parse5-htmlparser2-tree-adapter@^7.0.0:
+  version "7.1.0"
+  resolved "https://registry.yarnpkg.com/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.1.0.tgz#b5a806548ed893a43e24ccb42fbb78069311e81b"
+  integrity sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==
+  dependencies:
+    domhandler "^5.0.3"
+    parse5 "^7.0.0"
+
+parse5-parser-stream@^7.1.2:
+  version "7.1.2"
+  resolved "https://registry.yarnpkg.com/parse5-parser-stream/-/parse5-parser-stream-7.1.2.tgz#d7c20eadc37968d272e2c02660fff92dd27e60e1"
+  integrity sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==
+  dependencies:
+    parse5 "^7.0.0"
+
+parse5@^7.0.0, parse5@^7.1.2:
+  version "7.2.1"
+  resolved "https://registry.yarnpkg.com/parse5/-/parse5-7.2.1.tgz#8928f55915e6125f430cc44309765bf17556a33a"
+  integrity sha512-BuBYQYlv1ckiPdQi/ohiivi9Sagc9JG+Ozs0r7b/0iK3sKmrb0b9FdWdBbOdx6hBCM/F9Ir82ofnBhtZOjCRPQ==
+  dependencies:
+    entities "^4.5.0"
+
 parseurl@~1.3.3:
   version "1.3.3"
   resolved "https://registry.yarnpkg.com/parseurl/-/parseurl-1.3.3.tgz#9da19e7bee8d12dff0513ed5b76957793bc2e8d4"
@@ -6482,6 +6560,11 @@ undici-types@~5.26.4:
   resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-5.26.5.tgz#bcd539893d00b56e964fd2657a4866b221a65617"
   integrity sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==
 
+undici@^6.19.5:
+  version "6.21.1"
+  resolved "https://registry.yarnpkg.com/undici/-/undici-6.21.1.tgz#336025a14162e6837e44ad7b819b35b6c6af0e05"
+  integrity sha512-q/1rj5D0/zayJB2FraXdaWxbhWiNKDvu8naDT2dl1yTlvJp4BLtOcp2a5BvgGNQpYYJzau7tf1WgKv3b+7mqpQ==
+
 undici@~5.28.4:
   version "5.28.4"
   resolved "https://registry.yarnpkg.com/undici/-/undici-5.28.4.tgz#6b280408edb6a1a604a9b20340f45b422e373068"
@@ -6580,11 +6663,23 @@ webidl-conversions@^3.0.0:
   resolved "https://registry.yarnpkg.com/webidl-conversions/-/webidl-conversions-3.0.1.tgz#24534275e2a7bc6be7bc86611cc16ae0a5654871"
   integrity sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==
 
+whatwg-encoding@^3.1.1:
+  version "3.1.1"
+  resolved "https://registry.yarnpkg.com/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz#d0f4ef769905d426e1688f3e34381a99b60b76e5"
+  integrity sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==
+  dependencies:
+    iconv-lite "0.6.3"
+
 whatwg-fetch@^3.4.1, whatwg-fetch@^3.6.20:
   version "3.6.20"
   resolved "https://registry.yarnpkg.com/whatwg-fetch/-/whatwg-fetch-3.6.20.tgz#580ce6d791facec91d37c72890995a0b48d31c70"
   integrity sha512-EqhiFU6daOA8kpjOWTL0olhVOF3i7OrFzSYiGsEMB8GcXS+RrzauAERX65xMeNWVqxA6HXH2m69Z9LaKKdisfg==
 
+whatwg-mimetype@^4.0.0:
+  version "4.0.0"
+  resolved "https://registry.yarnpkg.com/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz#bc1bf94a985dc50388d54a9258ac405c3ca2fc0a"
+  integrity sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==
+
 whatwg-url@^5.0.0:
   version "5.0.0"
   resolved "https://registry.yarnpkg.com/whatwg-url/-/whatwg-url-5.0.0.tgz#966454e8765462e37644d3626f6742ce8b70965d"