mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2025-04-17 18:18:11 +00:00
Fix scraping failed bug in link/bulk link scrapers (#2807)
* Fix scraping-failed bug in link/bulk-link scrapers
* Reset submodule
* Swap to `networkidle2` as a safe middle ground for SPA and API-loaded pages that also does not hang on request-heavy pages
* Lint

Co-authored-by: timothycarambat <rambat1010@gmail.com>
This commit is contained in:
parent
6bc21860e4
commit
9bc01afa7d
2 changed files with 3 additions and 3 deletions
collector
|
@@ -61,7 +61,7 @@ async function getPageContent(link) {
        ignoreHTTPSErrors: true,
      },
      gotoOptions: {
-       waitUntil: "domcontentloaded",
+       waitUntil: "networkidle2",
      },
      async evaluate(page, browser) {
        const result = await page.evaluate(() => document.body.innerText);
|
|
@@ -48,7 +48,7 @@ async function getPageLinks(url, baseUrl) {
  try {
    const loader = new PuppeteerWebBaseLoader(url, {
      launchOptions: { headless: "new" },
-     gotoOptions: { waitUntil: "domcontentloaded" },
+     gotoOptions: { waitUntil: "networkidle2" },
    });
    const docs = await loader.load();
    const html = docs[0].pageContent;
@@ -92,7 +92,7 @@ async function bulkScrapePages(links, outFolderPath) {
  try {
    const loader = new PuppeteerWebBaseLoader(link, {
      launchOptions: { headless: "new" },
-     gotoOptions: { waitUntil: "domcontentloaded" },
+     gotoOptions: { waitUntil: "networkidle2" },
      async evaluate(page, browser) {
        const result = await page.evaluate(() => document.body.innerText);
        await browser.close();
|
Loading…
Add table
Reference in a new issue