mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2025-04-17 18:18:11 +00:00
[FIX] Bulk link scraper bug fix (#1800)
Patch the website-depth data connector so that it also works when the starting link is not the site's root URL.
This commit is contained in:
parent
b105c6c969
commit
fc375f4036
1 changed file with 7 additions and 3 deletions
|
@@ -10,7 +10,7 @@ const path = require("path");
|
|||
const fs = require("fs");
|
||||
|
||||
async function discoverLinks(startUrl, depth = 1, maxLinks = 20) {
|
||||
const baseUrl = new URL(startUrl).origin;
|
||||
const baseUrl = new URL(startUrl);
|
||||
const discoveredLinks = new Set();
|
||||
const pendingLinks = [startUrl];
|
||||
let currentLevel = 0;
|
||||
|
@@ -66,8 +66,12 @@ function extractLinks(html, baseUrl) {
|
|||
for (const link of links) {
|
||||
const href = link.getAttribute("href");
|
||||
if (href) {
|
||||
const absoluteUrl = new URL(href, baseUrl).href;
|
||||
if (absoluteUrl.startsWith(baseUrl)) {
|
||||
const absoluteUrl = new URL(href, baseUrl.href).href;
|
||||
if (
|
||||
absoluteUrl.startsWith(
|
||||
baseUrl.origin + baseUrl.pathname.split("/").slice(0, -1).join("/")
|
||||
)
|
||||
) {
|
||||
extractedLinks.add(absoluteUrl);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue