pathScripts/findRSS

#!/usr/bin/env python3
"""
Usage:
findRSS <start_url> --max_depth <depth>
Example:
findRSS https://example.com --max_depth 2
"""
import argparse
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from collections import deque


def find_feeds_on_page(url):
    """
    Scan a single webpage for RSS/Atom link tags.
    Returns a set of discovered feed URLs.
    """
    feeds = set()
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except (requests.RequestException, ValueError) as e:
        print(f"Failed to fetch {url}: {e}")
        return feeds

    soup = BeautifulSoup(response.text, "html.parser")

    # Search for <link> tags with known feed types
    for link_tag in soup.find_all("link"):
        link_type = link_tag.get("type", "")
        # rel can be None, string, or a list of strings
        rel_attrs = link_tag.get("rel", [])
        # Convert rel attributes to a single string for "feed" checks
        if isinstance(rel_attrs, list):
            rel_text = " ".join(rel_attrs).lower()
        else:
            rel_text = str(rel_attrs).lower()

        href = link_tag.get("href")
        if href and (
            "rss" in link_type.lower()
            or "atom" in link_type.lower()
            or "feed" in rel_text
        ):
            feed_url = urljoin(url, href)
            feeds.add(feed_url)

    return feeds
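
# Example (hypothetical page): if https://example.com served
# <link rel="alternate" type="application/rss+xml" href="/feed.xml">,
# find_feeds_on_page("https://example.com") would return
# {"https://example.com/feed.xml"}.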


def find_links_on_page(url, base_domain):
    """
    Find all URLs under the same domain from a webpage.
    Returns a set of discovered links.
    """
    links = set()
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except (requests.RequestException, ValueError):
        # If there's an error getting the page, return an empty set
        return links

    soup = BeautifulSoup(response.text, "html.parser")

    # Search for <a> tags
    for a_tag in soup.find_all("a", href=True):
        link = a_tag["href"]
        full_link = urljoin(url, link)
        # Only add links that are within the same domain
        if urlparse(full_link).netloc == base_domain:
            links.add(full_link)

    return links


def crawl_for_feeds(start_url, max_depth):
    """
    Crawl the given site up to `max_depth` pages deep, collecting RSS/Atom feeds.
    """
    base_domain = urlparse(start_url).netloc

    # Each item in the queue is a tuple: (url, depth)
    queue = deque([(start_url, 0)])
    visited = {start_url}
    feeds_found = set()

    while queue:
        current_url, depth = queue.popleft()

        # Find feeds on the current page
        page_feeds = find_feeds_on_page(current_url)
        feeds_found.update(page_feeds)

        # If we haven't reached the max depth, gather more pages
        if depth < max_depth:
            links = find_links_on_page(current_url, base_domain)
            for link in links:
                if link not in visited:
                    visited.add(link)
                    queue.append((link, depth + 1))

    return feeds_found
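
# Note: with max_depth=1 (the default set in main below), the crawl scans the
# start page plus every same-domain page linked directly from it.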


def main():
    parser = argparse.ArgumentParser(
        description="Crawl a website for RSS/Atom feeds up to a given depth."
    )
    parser.add_argument("start_url", help="The starting URL to begin crawling.")
    parser.add_argument(
        "--max_depth",
        type=int,
        default=1,
        help="How many pages deep to crawl from the original page (default is 1)."
    )
    args = parser.parse_args()

    feeds = crawl_for_feeds(args.start_url, args.max_depth)

    if feeds:
        print("Found the following feeds:")
        for feed in feeds:
            print(f" - {feed}")
    else:
        print("No feeds found.")


if __name__ == "__main__":
    main()