mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2025-04-17 18:18:11 +00:00
Fix scraping failed bug in link/bulk link scrapers (#2807)
* Fix scraping-failed bug in link/bulk-link scrapers
* Reset submodule
* Swap to `networkidle2` as a safe middle ground for SPA and API-loaded pages that also does not hang on request-heavy pages
* Lint

Co-authored-by: timothycarambat <rambat1010@gmail.com>
This commit is contained in:
parent
6bc21860e4
commit
9bc01afa7d
2 changed files with 3 additions and 3 deletions
collector
|
@@ -61,7 +61,7 @@ async function getPageContent(link) {
        ignoreHTTPSErrors: true,
      },
      gotoOptions: {
-       waitUntil: "domcontentloaded",
+       waitUntil: "networkidle2",
      },
      async evaluate(page, browser) {
        const result = await page.evaluate(() => document.body.innerText);
|
|
@@ -48,7 +48,7 @@ async function getPageLinks(url, baseUrl) {
  try {
    const loader = new PuppeteerWebBaseLoader(url, {
      launchOptions: { headless: "new" },
-     gotoOptions: { waitUntil: "domcontentloaded" },
+     gotoOptions: { waitUntil: "networkidle2" },
    });
    const docs = await loader.load();
    const html = docs[0].pageContent;
@@ -92,7 +92,7 @@ async function bulkScrapePages(links, outFolderPath) {
  try {
    const loader = new PuppeteerWebBaseLoader(link, {
      launchOptions: { headless: "new" },
-     gotoOptions: { waitUntil: "domcontentloaded" },
+     gotoOptions: { waitUntil: "networkidle2" },
      async evaluate(page, browser) {
        const result = await page.evaluate(() => document.body.innerText);
        await browser.close();
|
Loading…
Add table
Reference in a new issue