#!/usr/bin/env python3
"""
Usage: findRSS <start_url> --max_depth <depth>
Example: findRSS https://example.com --max_depth 2
"""
import argparse
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from collections import deque


def find_feeds_on_page(url):
    """
    Scan a single webpage for RSS/Atom link tags.
    Returns a set of discovered feed URLs.
    """
    feeds = set()
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except (requests.RequestException, ValueError) as e:
        print(f"Failed to fetch {url}: {e}")
        return feeds

    soup = BeautifulSoup(response.text, "html.parser")

    # Search for <link> tags with known feed types
    for link_tag in soup.find_all("link"):
        link_type = link_tag.get("type", "")
        # rel can be None, a string, or a list of strings
        rel_attrs = link_tag.get("rel", [])
        # Convert rel attributes to a single string for "feed" checks
        if isinstance(rel_attrs, list):
            rel_text = " ".join(rel_attrs).lower()
        else:
            rel_text = str(rel_attrs).lower()

        href = link_tag.get("href")
        if href and (
            "rss" in link_type.lower()
            or "atom" in link_type.lower()
            or "feed" in rel_text
        ):
            feed_url = urljoin(url, href)
            feeds.add(feed_url)

    return feeds


def find_links_on_page(url, base_domain):
    """
    Find all URLs under the same domain from a webpage.
    Returns a set of discovered links.
    """
    links = set()
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except (requests.RequestException, ValueError):
        # If there's an error getting the page, return an empty set
        return links

    soup = BeautifulSoup(response.text, "html.parser")

    # Search for <a> tags
    for a_tag in soup.find_all("a", href=True):
        link = a_tag["href"]
        full_link = urljoin(url, link)
        # Only add links that are within the same domain
        if urlparse(full_link).netloc == base_domain:
            links.add(full_link)

    return links


def crawl_for_feeds(start_url, max_depth):
    """
    Crawl the given site up to `max_depth` levels of links deep,
    collecting RSS/Atom feeds.
    """
    base_domain = urlparse(start_url).netloc

    # Each item in the queue is a tuple: (url, depth)
    queue = deque([(start_url, 0)])
    visited = set([start_url])
    feeds_found = set()

    while queue:
        current_url, depth = queue.popleft()

        # Find feeds on the current page
        page_feeds = find_feeds_on_page(current_url)
        feeds_found.update(page_feeds)

        # If we haven't reached the max depth, gather more pages to visit
        if depth < max_depth:
            links = find_links_on_page(current_url, base_domain)
            for link in links:
                if link not in visited:
                    visited.add(link)
                    queue.append((link, depth + 1))

    return feeds_found


def main():
    parser = argparse.ArgumentParser(
        description="Crawl a website for RSS/Atom feeds up to a given depth."
    )
    parser.add_argument("start_url", help="The starting URL to begin crawling.")
    parser.add_argument(
        "--max_depth",
        type=int,
        default=1,
        help="How many link levels deep to crawl from the starting page (default is 1).",
    )
    args = parser.parse_args()

    feeds = crawl_for_feeds(args.start_url, args.max_depth)

    if feeds:
        print("Found the following feeds:")
        for feed in feeds:
            print(f"  - {feed}")
    else:
        print("No feeds found.")


if __name__ == "__main__":
    main()