diff --git a/findRSS b/findRSS
new file mode 100755
index 0000000..6a41f29
--- /dev/null
+++ b/findRSS
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+
+"""
+Usage:
+    findRSS <start_url> --max_depth <depth>
+
+Example:
+    findRSS https://example.com --max_depth 2
+"""
+
+import argparse
+import requests
+from urllib.parse import urljoin, urlparse
+from bs4 import BeautifulSoup
+from collections import deque
+
+def find_feeds_on_page(url):
+    """
+    Scan a single webpage for RSS/Atom link tags.
+    Returns a set of discovered feed URLs.
+    """
+    feeds = set()
+
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+    except (requests.RequestException, ValueError) as e:
+        print(f"Failed to fetch {url}: {e}")
+        return feeds
+
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    # Search for <link> tags with known feed types
+    for link_tag in soup.find_all("link"):
+        link_type = link_tag.get("type", "")
+        # rel can be None, string, or a list of strings
+        rel_attrs = link_tag.get("rel", [])
+
+        # Convert rel attributes to a single string for "feed" checks
+        if isinstance(rel_attrs, list):
+            rel_text = " ".join(rel_attrs).lower()
+        else:
+            rel_text = str(rel_attrs).lower()
+
+        href = link_tag.get("href")
+
+        if href and (
+            "rss" in link_type.lower()
+            or "atom" in link_type.lower()
+            or "feed" in rel_text
+        ):
+            feed_url = urljoin(url, href)
+            feeds.add(feed_url)
+
+    return feeds
+
+def find_links_on_page(url, base_domain):
+    """
+    Find all URLs under the same domain from a webpage.
+    Returns a set of discovered links.
+    """
+    links = set()
+
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+    except (requests.RequestException, ValueError) as e:
+        # If there's an error getting the page, return an empty set
+        return links
+
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    # Search for <a> tags
+    for a_tag in soup.find_all("a", href=True):
+        link = a_tag["href"]
+        full_link = urljoin(url, link)
+        # Only add links that are within the same domain
+        if urlparse(full_link).netloc == base_domain:
+            links.add(full_link)
+
+    return links
+
+def crawl_for_feeds(start_url, max_depth):
+    """
+    Crawl the given site up to `max_depth` pages deep, collecting RSS/Atom feeds.
+    """
+    base_domain = urlparse(start_url).netloc
+
+    # Each item in the queue is a tuple: (url, depth)
+    queue = deque([(start_url, 0)])
+    visited = set([start_url])
+    feeds_found = set()
+
+    while queue:
+        current_url, depth = queue.popleft()
+
+        # Find feeds on the current page
+        page_feeds = find_feeds_on_page(current_url)
+        feeds_found.update(page_feeds)
+
+        # If we haven't reached the max depth, gather more pages
+        if depth < max_depth:
+            links = find_links_on_page(current_url, base_domain)
+            for link in links:
+                if link not in visited:
+                    visited.add(link)
+                    queue.append((link, depth + 1))
+
+    return feeds_found
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Crawl a website for RSS/Atom feeds up to a given depth."
+    )
+    parser.add_argument("start_url", help="The starting URL to begin crawling.")
+    parser.add_argument(
+        "--max_depth",
+        type=int,
+        default=1,
+        help="How many pages deep to crawl from the original page (default is 1)."
+    )
+    args = parser.parse_args()
+
+    feeds = crawl_for_feeds(args.start_url, args.max_depth)
+    if feeds:
+        print("Found the following feeds:")
+        for feed in feeds:
+            print(f"  - {feed}")
+    else:
+        print("No feeds found.")
+
+if __name__ == "__main__":
+    main()
+
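[Reviewer note, not part of the patch] findRSS only inspects <link> tags, so every URL it reports is a candidate rather than a verified feed. A minimal sketch of how the crawl results could be double-checked with the feedparser library is shown below; the verify_feeds helper is hypothetical and illustrative only.

    import feedparser

    def verify_feeds(candidate_urls):
        """Return only the candidates that actually parse as RSS/Atom feeds."""
        confirmed = []
        for url in candidate_urls:
            parsed = feedparser.parse(url)
            # bozo is set when the document is not well-formed feed XML
            if not parsed.bozo and parsed.entries:
                confirmed.append(url)
        return confirmed

Something like verify_feeds(crawl_for_feeds("https://example.com", 2)) would then filter out pages that merely advertise a feed link that no longer resolves.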
diff --git a/n3k b/n3k
index d70c979..d9bb213 100755
--- a/n3k
+++ b/n3k
@@ -3,17 +3,33 @@
 import sys
 import asyncio
 import trafilatura
+import requests
 from newspaper import Article
 from urllib.parse import urlparse
 from datetime import datetime
-import math
 from typing import Optional
-import textwrap
 import re
 
+def fetch_url_with_headers(url: str) -> Optional[str]:
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+                      'AppleWebKit/537.36 (KHTML, like Gecko) '
+                      'Chrome/115.0.0.0 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Referer': 'https://www.google.com/'
+    }
+    try:
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()
+        return response.text
+    except Exception as e:
+        print(f"Error fetching URL with headers: {str(e)}", file=sys.stderr)
+        return None
+
 async def fetch_and_parse_article(url: str):
-    # Try trafilatura first
-    source = trafilatura.fetch_url(url)
+    # First try using trafilatura with custom headers
+    source = fetch_url_with_headers(url)
 
     if source:
         try:
@@ -23,12 +39,12 @@ async def fetch_and_parse_article(url: str):
             article.set_html(source)
             article.parse()
 
-            # Extract text with trafilatura but without markdown to avoid its line breaking
+            # Extract text with trafilatura without markdown to avoid its line breaking
             raw_text = trafilatura.extract(source, output_format="text", include_comments=False)
             if raw_text:
                 article.text = raw_text
 
-            # Update other article properties with trafilatura data
+            # Update article properties using trafilatura data
             article.title = article.title or traf.title or url
             article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
             article.publish_date = traf.date or datetime.now()
@@ -38,13 +54,16 @@ async def fetch_and_parse_article(url: str):
             return article
         except Exception as e:
             print(f"Trafilatura extraction failed: {str(e)}", file=sys.stderr)
-            pass
 
-    # Fallback to newspaper3k
+    # Fallback to newspaper3k with headers
     try:
         headers = {
-            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+                          'AppleWebKit/537.36 (KHTML, like Gecko) '
+                          'Chrome/115.0.0.0 Safari/537.36',
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Referer': 'https://www.google.com/'
         }
 
         article = Article(url)
@@ -86,10 +105,7 @@ def format_article_markdown(article) -> str:
 
     # Format article text with improved paragraph handling
    if article.text:
-        # Clean and normalize the text first
         clean_content = clean_text(article.text)
-
-        # Just split on double newlines and don't use textwrap
         output += clean_content
 
     return output
@@ -110,3 +126,4 @@ async def main():
 
 if __name__ == "__main__":
     asyncio.run(main())
+
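[Reviewer note, not part of the patch] The n3k change replaces trafilatura.fetch_url with a requests call that sends browser-like headers, then hands the HTML to trafilatura.extract. A standalone sketch of that flow is below; unlike the patch, it passes an explicit timeout, which would be a reasonable follow-up since requests.get with no timeout can block indefinitely. The extract_text name and the 15-second value are assumptions for illustration.

    from typing import Optional

    import requests
    import trafilatura

    BROWSER_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/115.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.5',
    }

    def extract_text(url: str) -> Optional[str]:
        # Fetch with headers that look like a normal browser request
        response = requests.get(url, headers=BROWSER_HEADERS, timeout=15)
        response.raise_for_status()
        # Plain-text extraction, mirroring the options used in the patch
        return trafilatura.extract(
            response.text, output_format="text", include_comments=False
        )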
diff --git a/shredr b/shredr
new file mode 100755
index 0000000..c57bafd
--- /dev/null
+++ b/shredr
@@ -0,0 +1,80 @@
+#!/bin/bash
+
+# -------------------------------------------------
+# shredr
+# Description:
+#   This script securely deletes a file or all files within a directory.
+#
+# Usage:
+#   ./shredr /path/to/file_or_directory
+# -------------------------------------------------
+
+# Function to display usage information
+usage() {
+    echo "Usage: $0 /path/to/file_or_directory"
+    exit 1
+}
+
+# Check if exactly one argument is provided
+if [ "$#" -ne 1 ]; then
+    echo "Error: Exactly one path must be provided."
+    usage
+fi
+
+TARGET_PATH="$1"
+
+# Check if the path exists
+if [ ! -e "$TARGET_PATH" ]; then
+    echo "Error: The path '$TARGET_PATH' does not exist."
+    exit 1
+fi
+
+# Function to securely overwrite and remove a single file (-u unlinks it after overwriting)
+shred_file() {
+    local file="$1"
+    echo "Shredding file: $file"
+    shred -f -v -z -u -n 9 "$file"
+
+    if [ $? -eq 0 ]; then
+        echo "Successfully shredded: $file"
+    else
+        echo "Failed to shred: $file" >&2
+    fi
+}
+
+# Determine if the path is a file or directory
+if [ -f "$TARGET_PATH" ]; then
+    # It's a regular file
+    shred_file "$TARGET_PATH"
+
+elif [ -d "$TARGET_PATH" ]; then
+    # It's a directory
+    echo "Detected directory: $TARGET_PATH"
+    echo "Shredding all files within the directory..."
+
+    # Find and shred all regular files within the directory
+    find "$TARGET_PATH" -type f -print0 | while IFS= read -r -d '' file; do
+        shred_file "$file"
+    done
+
+    echo "All files within '$TARGET_PATH' have been shredded."
+
+    # Remove the now-empty directory structure
+    echo "Removing directory: $TARGET_PATH"
+    rm -rf "$TARGET_PATH"
+
+    if [ $? -eq 0 ]; then
+        echo "Directory '$TARGET_PATH' has been removed."
+    else
+        echo "Failed to remove directory: $TARGET_PATH" >&2
+    fi
+
+else
+    # Neither a regular file nor a directory
+    echo "Error: The path '$TARGET_PATH' is neither a regular file nor a directory."
+    exit 1
+fi
+
+echo "Secure deletion completed successfully."
+
+exit 0
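[Reviewer note, not part of the patch] shredr depends on the GNU shred binary, and shred's overwrite guarantees are weaker on journaling and copy-on-write filesystems. Where shred is unavailable, a rough Python equivalent of the overwrite-then-remove step might look like the sketch below; it is a single random-pass illustration under that assumption, not a substitute for the script, and the overwrite_and_remove name is hypothetical.

    import os
    from pathlib import Path

    def overwrite_and_remove(path: str) -> None:
        """Overwrite a file with random bytes, flush to disk, then unlink it."""
        target = Path(path)
        size = target.stat().st_size
        with open(target, "r+b") as handle:
            # Single pass of random data; fine for small files, chunk it for large ones
            handle.write(os.urandom(size))
            handle.flush()
            os.fsync(handle.fileno())
        target.unlink()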