From 279729850760667b21a62c321b3f29875819599a Mon Sep 17 00:00:00 2001
From: Sean Hatfield <seanhatfield5@gmail.com>
Date: Mon, 12 Aug 2024 11:44:35 -0700
Subject: [PATCH] Fix depth handling in bulk link scraper (#2096)

fix depth handling in bulk link scraper
---
 .../utils/extensions/WebsiteDepth/index.js | 44 ++++++++++---------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/collector/utils/extensions/WebsiteDepth/index.js b/collector/utils/extensions/WebsiteDepth/index.js
index 0b76ab914..d00718129 100644
--- a/collector/utils/extensions/WebsiteDepth/index.js
+++ b/collector/utils/extensions/WebsiteDepth/index.js
@@ -9,34 +9,36 @@ const { tokenizeString } = require("../../tokenizer");
 const path = require("path");
 const fs = require("fs");
 
-async function discoverLinks(startUrl, depth = 1, maxLinks = 20) {
+async function discoverLinks(startUrl, maxDepth = 1, maxLinks = 20) {
   const baseUrl = new URL(startUrl);
-  const discoveredLinks = new Set();
-  const pendingLinks = [startUrl];
-  let currentLevel = 0;
-  depth = depth < 1 ? 1 : depth;
-  maxLinks = maxLinks < 1 ? 1 : maxLinks;
+  const discoveredLinks = new Set([startUrl]);
+  let queue = [[startUrl, 0]]; // [url, currentDepth]
+  const scrapedUrls = new Set();
 
-  // Check depth and if there are any links left to scrape
-  while (currentLevel < depth && pendingLinks.length > 0) {
-    const newLinks = await getPageLinks(pendingLinks[0], baseUrl);
-    pendingLinks.shift();
+  for (let currentDepth = 0; currentDepth < maxDepth; currentDepth++) {
+    const levelSize = queue.length;
+    const nextQueue = [];
 
-    for (const link of newLinks) {
-      if (!discoveredLinks.has(link)) {
-        discoveredLinks.add(link);
-        pendingLinks.push(link);
-      }
+    for (let i = 0; i < levelSize && discoveredLinks.size < maxLinks; i++) {
+      const [currentUrl, urlDepth] = queue[i];
 
-      // Exit out if we reach maxLinks
-      if (discoveredLinks.size >= maxLinks) {
-        return Array.from(discoveredLinks).slice(0, maxLinks);
+      if (!scrapedUrls.has(currentUrl)) {
+        scrapedUrls.add(currentUrl);
+        const newLinks = await getPageLinks(currentUrl, baseUrl);
+
+        for (const link of newLinks) {
+          if (!discoveredLinks.has(link) && discoveredLinks.size < maxLinks) {
+            discoveredLinks.add(link);
+            if (urlDepth + 1 < maxDepth) {
+              nextQueue.push([link, urlDepth + 1]);
+            }
+          }
+        }
       }
     }
 
-    if (pendingLinks.length === 0) {
-      currentLevel++;
-    }
+    queue = nextQueue;
+    if (queue.length === 0 || discoveredLinks.size >= maxLinks) break;
   }
 
   return Array.from(discoveredLinks);
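
A minimal standalone sketch (not part of the patch) of the level-by-level crawl that the
patched discoverLinks performs, useful for exercising the depth and maxLinks caps without
the collector. The in-memory siteMap and the stubbed getPageLinks below are assumptions
for illustration only; the real getPageLinks in this file fetches a page and filters links
against baseUrl.

// sketch.js -- run with `node sketch.js`; illustration only, assumptions noted above.
const siteMap = {
  "https://example.com/": ["https://example.com/a", "https://example.com/b"],
  "https://example.com/a": ["https://example.com/a/1", "https://example.com/a/2"],
  "https://example.com/b": ["https://example.com/b/1"],
};

// Hypothetical stub standing in for the real page fetch + link extraction.
async function getPageLinks(url) {
  return siteMap[url] || [];
}

async function discoverLinks(startUrl, maxDepth = 1, maxLinks = 20) {
  const discoveredLinks = new Set([startUrl]);
  let queue = [[startUrl, 0]]; // [url, depth at which this page sits]
  const scrapedUrls = new Set();

  for (let currentDepth = 0; currentDepth < maxDepth; currentDepth++) {
    const levelSize = queue.length;
    const nextQueue = [];

    for (let i = 0; i < levelSize && discoveredLinks.size < maxLinks; i++) {
      const [currentUrl, urlDepth] = queue[i];
      if (scrapedUrls.has(currentUrl)) continue;
      scrapedUrls.add(currentUrl);

      for (const link of await getPageLinks(currentUrl)) {
        if (!discoveredLinks.has(link) && discoveredLinks.size < maxLinks) {
          discoveredLinks.add(link);
          // Only queue the link if scraping it would stay within maxDepth.
          if (urlDepth + 1 < maxDepth) nextQueue.push([link, urlDepth + 1]);
        }
      }
    }

    queue = nextQueue;
    if (queue.length === 0 || discoveredLinks.size >= maxLinks) break;
  }

  return Array.from(discoveredLinks);
}

(async () => {
  // Depth 1: only the start page is scraped; its direct links are discovered.
  console.log((await discoverLinks("https://example.com/", 1)).length); // 3
  // Depth 2: the pages discovered at depth 1 are scraped as well.
  console.log((await discoverLinks("https://example.com/", 2)).length); // 6
})();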