From fc375f40367972ce7a52dbe951174b61402613ca Mon Sep 17 00:00:00 2001
From: Sean Hatfield <seanhatfield5@gmail.com>
Date: Mon, 1 Jul 2024 16:59:28 -0700
Subject: [PATCH] [FIX] Bulk link scraper bug fix (#1800)

Patch the website depth data connector so that it also works when the start URL is not the site's root URL.
---
 collector/utils/extensions/WebsiteDepth/index.js | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/collector/utils/extensions/WebsiteDepth/index.js b/collector/utils/extensions/WebsiteDepth/index.js
index 7cbb04e57..0b76ab914 100644
--- a/collector/utils/extensions/WebsiteDepth/index.js
+++ b/collector/utils/extensions/WebsiteDepth/index.js
@@ -10,7 +10,7 @@ const path = require("path");
 const fs = require("fs");
 
 async function discoverLinks(startUrl, depth = 1, maxLinks = 20) {
-  const baseUrl = new URL(startUrl).origin;
+  const baseUrl = new URL(startUrl);
   const discoveredLinks = new Set();
   const pendingLinks = [startUrl];
   let currentLevel = 0;
@@ -66,8 +66,12 @@ function extractLinks(html, baseUrl) {
   for (const link of links) {
     const href = link.getAttribute("href");
     if (href) {
-      const absoluteUrl = new URL(href, baseUrl).href;
-      if (absoluteUrl.startsWith(baseUrl)) {
+      const absoluteUrl = new URL(href, baseUrl.href).href;
+      if (
+        absoluteUrl.startsWith(
+          baseUrl.origin + baseUrl.pathname.split("/").slice(0, -1).join("/")
+        )
+      ) {
         extractedLinks.add(absoluteUrl);
       }
     }