From e7ba02849790fe7baa428d61031e6f43646f7995 Mon Sep 17 00:00:00 2001
From: AntonioCiolino <antonio.ciolino@gmail.com>
Date: Fri, 16 Jun 2023 20:29:11 -0400
Subject: [PATCH] Enable web scraping based on a URL and a simple filter. (#73)

---
 collector/main.py         |  6 +++++-
 collector/scripts/link.py | 26 +++++++++++++++++++++++++-
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/collector/main.py b/collector/main.py
index cd800eb9e..bb0054b92 100644
--- a/collector/main.py
+++ b/collector/main.py
@@ -1,7 +1,7 @@
 import os
 from InquirerPy import inquirer
 from scripts.youtube import youtube
-from scripts.link import link, links
+from scripts.link import link, links, crawler
 from scripts.substack import substack
 from scripts.medium import medium
 from scripts.gitbook import gitbook
@@ -42,6 +42,7 @@ def main():
       choices=[
         {"name": "Single URL", "value": "Single URL"},
         {"name": "Multiple URLs", "value": "Multiple URLs"},
+        {"name": "URL Crawler", "value": "URL Crawler"},
         {"name": "Abort", "value": "Abort"},
       ],
     ).execute()
@@ -51,6 +52,9 @@ def main():
     if method == 'Single URL':
       link()
       exit(0)
     if method == 'Multiple URLs':
       links()
       exit(0)
+    if method == 'URL Crawler':
+      crawler()
+      exit(0)
     if method == 'Abort':
       exit(0)
     if method == 'YouTube Channel':
diff --git a/collector/scripts/link.py b/collector/scripts/link.py
index 8bcc02e0e..17a532cb0 100644
--- a/collector/scripts/link.py
+++ b/collector/scripts/link.py
@@ -4,6 +4,8 @@ from requests_html import HTMLSession
 from langchain.document_loaders import UnstructuredHTMLLoader
 from .link_utils import append_meta
 from .utils import tokenize, ada_v2_cost
+import requests
+from bs4 import BeautifulSoup

 # Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
 def link():
@@ -64,6 +66,29 @@ def link():
   print(f"////////////////////////////")
   exit(0)

+def crawler():
+  prompt = "Paste in root URI of the pages of interest: "
+  new_link = input(prompt)
+  filter_value = input("Add a filter value for the url to ensure links don't wander too far: ")
+  # extract the root site (scheme + host) from the uri provided
+  root_site = urlparse(new_link).scheme + "://" + urlparse(new_link).hostname
+  links = []
+  urls = new_link
+  links.append(new_link)
+  grab = requests.get(urls)
+  soup = BeautifulSoup(grab.text, 'html.parser')
+
+  # traverse anchor tags in the soup
+  for link in soup.find_all("a"):
+    data = link.get('href', '').strip()  # default '' guards anchors without an href
+    if filter_value in data:
+      print (data)
+      links.append(root_site + data)
+    else:
+      print (data + " does not apply for linking...")
+  # parse the links found
+  parse_links(links)
+
 def links():
   links = []
   prompt = "Paste in the URL of an online article or blog: "
@@ -86,7 +111,6 @@ def links():

   parse_links(links)

-
 # parse links from array
 def parse_links(links):
   totalTokens = 0
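
Note (editor's sketch, not part of the patch): crawler() builds each collected link as root_site + data, which is only correct when the href is root-relative (e.g. "/2023/06/post/"); an absolute href ("https://other.site/...") gets the root prepended to a full URL, and a page-relative href ("../post/") resolves to the wrong path. The sketch below is one hardened variant using urllib.parse.urljoin; the name safe_crawl, the timeout value, and the "/2022/" filter are illustrative assumptions, not code from the repo.

    # Sketch of a hardened crawler: resolves relative hrefs with urljoin,
    # skips anchors without an href, and de-duplicates collected links.
    from urllib.parse import urljoin

    import requests
    from bs4 import BeautifulSoup

    def safe_crawl(root_url, filter_value):
        # Collect root_url plus every link on that page whose resolved URL
        # contains filter_value, without duplicates.
        resp = requests.get(root_url, timeout=30)  # timeout is an assumed value
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        links, seen = [root_url], {root_url}
        for anchor in soup.find_all("a", href=True):  # href=True skips bare anchors
            # urljoin handles absolute, root-relative, and page-relative hrefs,
            # unlike naive concatenation with the root site.
            absolute = urljoin(root_url, anchor["href"].strip())
            if filter_value in absolute and absolute not in seen:
                seen.add(absolute)
                links.append(absolute)
        return links

    if __name__ == "__main__":
        # Example URL taken from the comment in link.py; the filter value is
        # an illustrative way to keep links from wandering off the 2022 archive.
        for url in safe_crawl("https://tim.blog/2022/08/09/nft-insider-trading-policy/", "/2022/"):
            print(url)

The returned list has the same shape as the links list that crawler() hands to parse_links(), so the existing tokenize/cost pipeline in link.py would consume it unchanged.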