Handle non-ASCII characters in single and bulk link scraper URLs

Handle non-ASCII characters in URLs.
This commit is contained in:
Sean Hatfield 2024-10-17 17:04:00 -07:00 committed by GitHub
parent 93d7ce6d34
commit 41522cdfb4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 4 additions and 2 deletions
collector
processLink/convert
utils/extensions/WebsiteDepth

View file

@@ -27,7 +27,8 @@ async function scrapeGenericUrl(link, textOnly = false) {
}
const url = new URL(link);
-const filename = (url.host + "-" + url.pathname).replace(".", "_");
+const decodedPathname = decodeURIComponent(url.pathname);
+const filename = `${url.hostname}${decodedPathname.replace(/\//g, '_')}`;
const data = {
id: v4(),

View file

@@ -108,7 +108,8 @@ async function bulkScrapePages(links, outFolderPath) {
}
const url = new URL(link);
-const filename = (url.host + "-" + url.pathname).replace(".", "_");
+const decodedPathname = decodeURIComponent(url.pathname);
+const filename = `${url.hostname}${decodedPathname.replace(/\//g, '_')}`;
const data = {
id: v4(),