/**
 * Breadth-first discovery of links reachable from `startUrl`.
 *
 * Crawls level-by-level: scrapes every URL in the current frontier via
 * `getPageLinks`, collects unseen links (restricted to `baseUrl`'s site by
 * that helper), and descends until `maxDepth` levels have been scraped or
 * `maxLinks` URLs have been collected. The start URL itself is included in
 * the result and counts toward the `maxLinks` cap.
 *
 * @param {string} startUrl - Fully-qualified URL to begin crawling from.
 * @param {number} [maxDepth=1] - Link-levels to follow; clamped to >= 1 so a
 *   non-positive value still scrapes one level (mirrors the original guard).
 * @param {number} [maxLinks=20] - Maximum URLs to return; clamped to >= 1.
 * @returns {Promise<string[]>} Discovered URLs in BFS insertion order,
 *   beginning with `startUrl`.
 */
async function discoverLinks(startUrl, maxDepth = 1, maxLinks = 20) {
  // Clamp limits defensively; do not reassign the parameters themselves.
  const depthLimit = Math.max(1, maxDepth);
  const linkLimit = Math.max(1, maxLinks);

  const baseUrl = new URL(startUrl);
  const discoveredLinks = new Set([startUrl]);
  const scrapedUrls = new Set();
  // BFS frontier. Every URL in `queue` sits at the same depth
  // (`currentDepth`), so no per-entry depth bookkeeping is required.
  let queue = [startUrl];

  for (let currentDepth = 0; currentDepth < depthLimit; currentDepth++) {
    const nextQueue = [];

    for (const currentUrl of queue) {
      // Stop scraping this level once the cap is reached.
      if (discoveredLinks.size >= linkLimit) break;
      if (scrapedUrls.has(currentUrl)) continue;
      scrapedUrls.add(currentUrl);

      const newLinks = await getPageLinks(currentUrl, baseUrl);
      for (const link of newLinks) {
        if (!discoveredLinks.has(link) && discoveredLinks.size < linkLimit) {
          discoveredLinks.add(link);
          // Only enqueue for scraping if another level remains below this one.
          if (currentDepth + 1 < depthLimit) {
            nextQueue.push(link);
          }
        }
      }
    }

    queue = nextQueue;
    if (queue.length === 0 || discoveredLinks.size >= linkLimit) break;
  }

  return Array.from(discoveredLinks);
}