From fc375f40367972ce7a52dbe951174b61402613ca Mon Sep 17 00:00:00 2001 From: Sean Hatfield <seanhatfield5@gmail.com> Date: Mon, 1 Jul 2024 16:59:28 -0700 Subject: [PATCH] [FIX] Bulk link scraper bug fix (#1800) Patch the website depth data connector to work for links other than the root URL --- collector/utils/extensions/WebsiteDepth/index.js | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/collector/utils/extensions/WebsiteDepth/index.js b/collector/utils/extensions/WebsiteDepth/index.js index 7cbb04e57..0b76ab914 100644 --- a/collector/utils/extensions/WebsiteDepth/index.js +++ b/collector/utils/extensions/WebsiteDepth/index.js @@ -10,7 +10,7 @@ const path = require("path"); const fs = require("fs"); async function discoverLinks(startUrl, depth = 1, maxLinks = 20) { - const baseUrl = new URL(startUrl).origin; + const baseUrl = new URL(startUrl); const discoveredLinks = new Set(); const pendingLinks = [startUrl]; let currentLevel = 0; @@ -66,8 +66,12 @@ function extractLinks(html, baseUrl) { for (const link of links) { const href = link.getAttribute("href"); if (href) { - const absoluteUrl = new URL(href, baseUrl).href; - if (absoluteUrl.startsWith(baseUrl)) { + const absoluteUrl = new URL(href, baseUrl.href).href; + if ( + absoluteUrl.startsWith( + baseUrl.origin + baseUrl.pathname.split("/").slice(0, -1).join("/") + ) + ) { extractedLinks.add(absoluteUrl); } }