[FIX] Bulk link scraper bug fix

Patch the website depth data connector so that it also works when the starting link is not the site's root URL.
This commit is contained in:
Sean Hatfield 2024-07-01 16:59:28 -07:00 committed by GitHub
parent b105c6c969
commit fc375f4036
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -10,7 +10,7 @@ const path = require("path");
const fs = require("fs");
async function discoverLinks(startUrl, depth = 1, maxLinks = 20) {
const baseUrl = new URL(startUrl).origin;
const baseUrl = new URL(startUrl);
const discoveredLinks = new Set();
const pendingLinks = [startUrl];
let currentLevel = 0;
@ -66,8 +66,12 @@ function extractLinks(html, baseUrl) {
for (const link of links) {
const href = link.getAttribute("href");
if (href) {
const absoluteUrl = new URL(href, baseUrl).href;
if (absoluteUrl.startsWith(baseUrl)) {
const absoluteUrl = new URL(href, baseUrl.href).href;
if (
absoluteUrl.startsWith(
baseUrl.origin + baseUrl.pathname.split("/").slice(0, -1).join("/")
)
) {
extractedLinks.add(absoluteUrl);
}
}