anything-llm/collector/processLink/convert/generic.js

const { v4 } = require("uuid");
const {
  PuppeteerWebBaseLoader,
} = require("langchain/document_loaders/web/puppeteer");
const { writeToServerDocuments } = require("../../utils/files");
const { tokenizeString } = require("../../utils/tokenizer");
const { default: slugify } = require("slugify");
/**
 * Scrape a generic URL and return the content in the specified format
 * @param {string} link - The URL to scrape
 * @param {('html' | 'text')} captureAs - The format to capture the page content as
 * @param {boolean} processAsDocument - Whether to process the content as a document or return the content directly
 * @returns {Promise<Object>} - The content of the page
 */
async function scrapeGenericUrl(
  link,
  captureAs = "text",
  processAsDocument = true
) {
  console.log(`-- Working URL ${link} => (${captureAs}) --`);
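  // Fetch the page content with Puppeteer, falling back to a plain HTTP fetch if that fails.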
  const content = await getPageContent(link, captureAs);
  if (!content || !content.length) {
    console.error(`Resulting URL content was empty at ${link}.`);
    return {
      success: false,
      reason: `No URL content found at ${link}.`,
      documents: [],
    };
  }

  if (!processAsDocument) {
    return {
      success: true,
      content,
    };
  }
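  // Build a filesystem-friendly filename from the URL host and its decoded path.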
  const url = new URL(link);
  const decodedPathname = decodeURIComponent(url.pathname);
  const filename = `${url.hostname}${decodedPathname.replace(/\//g, "_")}`;
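  // Assemble the document payload (metadata plus page content) to be written to server documents.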
  const data = {
    id: v4(),
    url: "file://" + slugify(filename) + ".html",
    title: slugify(filename) + ".html",
    docAuthor: "no author found",
    description: "No description found.",
    docSource: "URL link uploaded by the user.",
    chunkSource: `link://${link}`,
    published: new Date().toLocaleString(),
    wordCount: content.split(" ").length,
    pageContent: content,
    token_count_estimate: tokenizeString(content),
  };
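  // Persist the document so it is available to the server for embedding.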
  const document = writeToServerDocuments(
    data,
    `url-${slugify(filename)}-${data.id}`
  );

  console.log(`[SUCCESS]: URL ${link} converted & ready for embedding.\n`);
  return { success: true, reason: null, documents: [document] };
}

/**
 * Get the content of a page
 * @param {string} link - The URL to get the content of
 * @param {('html' | 'text')} captureAs - The format to capture the page content as
 * @returns {Promise<string|null>} - The content of the page, or null if it could not be fetched
 */
async function getPageContent(link, captureAs = "text") {
  try {
    let pageContents = [];
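    // Load the page with Puppeteer; the evaluate callback extracts either plain text or raw HTML.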
    const loader = new PuppeteerWebBaseLoader(link, {
      launchOptions: {
        headless: "new",
        ignoreHTTPSErrors: true,
      },
      gotoOptions: {
        waitUntil: "networkidle2",
      },
      async evaluate(page, browser) {
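        // page.evaluate runs in the browser context, so captureAs must be passed in as an argument.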
        const result = await page.evaluate((captureAs) => {
          if (captureAs === "text") return document.body.innerText;
          if (captureAs === "html") return document.documentElement.innerHTML;
          return document.body.innerText;
        }, captureAs);
        await browser.close();
        return result;
      },
    });
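    // load() runs the scrape and returns an array of LangChain Document objects.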
    const docs = await loader.load();
    for (const doc of docs) {
      pageContents.push(doc.pageContent);
    }
    return pageContents.join(" ");
  } catch (error) {
    console.error(
      "getPageContent failed to fetch the page with puppeteer - falling back to fetch!",
      error
    );
  }
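  // Fallback: fetch the page with a plain HTTP GET when Puppeteer fails.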
  try {
    const pageText = await fetch(link, {
      method: "GET",
      headers: {
        "Content-Type": "text/plain",
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)",
      },
    }).then((res) => res.text());
    return pageText;
  } catch (error) {
    console.error("getPageContent failed to fetch the page by any method.", error);
  }

  return null;
}

module.exports = {
scrapeGenericUrl,
};