Fix "scraping failed" bug in link/bulk link scrapers ()

* fix "scraping failed" bug in link/bulk link scrapers

* reset submodule

* swap to networkidle2 as a safe middle ground: it waits long enough for SPA and API-loaded pages to settle, but does not hang on request-heavy pages (see the sketch after this list)

* lint
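
For context on the swap, a minimal sketch of the loader configuration the commit lands on. This is illustrative, not the repo's exact file: the import path and the standalone getPageContent wrapper are assumptions.

// Minimal sketch, not the project's exact code. "networkidle2" resolves once
// no more than two network connections have been open for 500ms -- late
// enough for SPA/API-driven pages to hydrate, but unlike "networkidle0" it
// will not wait forever on pages that keep long-polling or analytics
// connections open.
const {
  PuppeteerWebBaseLoader,
} = require("langchain/document_loaders/web/puppeteer"); // import path is an assumption

async function getPageContent(link) {
  const loader = new PuppeteerWebBaseLoader(link, {
    launchOptions: { headless: "new", ignoreHTTPSErrors: true },
    gotoOptions: { waitUntil: "networkidle2" },
    async evaluate(page, browser) {
      // Grab the rendered text only after the page has gone network-idle.
      const result = await page.evaluate(() => document.body.innerText);
      await browser.close();
      return result;
    },
  });
  const docs = await loader.load();
  return docs[0]?.pageContent;
}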

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>
Sean Hatfield 2024-12-12 06:01:52 +08:00 committed by GitHub
parent 6bc21860e4
commit 9bc01afa7d
2 changed files with 3 additions and 3 deletions:
  collector/processLink/convert
  collector/utils/extensions/WebsiteDepth

collector/processLink/convert

@@ -61,7 +61,7 @@ async function getPageContent(link) {
       ignoreHTTPSErrors: true,
     },
     gotoOptions: {
-      waitUntil: "domcontentloaded",
+      waitUntil: "networkidle2",
     },
     async evaluate(page, browser) {
       const result = await page.evaluate(() => document.body.innerText);

collector/utils/extensions/WebsiteDepth

@@ -48,7 +48,7 @@ async function getPageLinks(url, baseUrl) {
   try {
     const loader = new PuppeteerWebBaseLoader(url, {
       launchOptions: { headless: "new" },
-      gotoOptions: { waitUntil: "domcontentloaded" },
+      gotoOptions: { waitUntil: "networkidle2" },
     });
     const docs = await loader.load();
     const html = docs[0].pageContent;
@@ -92,7 +92,7 @@ async function bulkScrapePages(links, outFolderPath) {
   try {
     const loader = new PuppeteerWebBaseLoader(link, {
       launchOptions: { headless: "new" },
-      gotoOptions: { waitUntil: "domcontentloaded" },
+      gotoOptions: { waitUntil: "networkidle2" },
       async evaluate(page, browser) {
         const result = await page.evaluate(() => document.body.innerText);
         await browser.close();