Auto-update: Fri Mar 28 09:42:10 PDT 2025

sanj 2025-03-28 09:42:10 -07:00
parent 0c0fd27fe7
commit cfc746242e
3 changed files with 243 additions and 12 deletions

findRSS (new executable file, 134 lines added)

@@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""
Usage:
    findRSS <start_url> --max_depth <depth>
Example:
    findRSS https://example.com --max_depth 2
"""

import argparse
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from collections import deque


def find_feeds_on_page(url):
    """
    Scan a single webpage for RSS/Atom link tags.
    Returns a set of discovered feed URLs.
    """
    feeds = set()
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except (requests.RequestException, ValueError) as e:
        print(f"Failed to fetch {url}: {e}")
        return feeds

    soup = BeautifulSoup(response.text, "html.parser")

    # Search for <link> tags with known feed types
    for link_tag in soup.find_all("link"):
        link_type = link_tag.get("type", "")
        # rel can be None, a string, or a list of strings
        rel_attrs = link_tag.get("rel", [])
        # Convert rel attributes to a single string for "feed" checks
        if isinstance(rel_attrs, list):
            rel_text = " ".join(rel_attrs).lower()
        else:
            rel_text = str(rel_attrs).lower()
        href = link_tag.get("href")
        if href and (
            "rss" in link_type.lower()
            or "atom" in link_type.lower()
            or "feed" in rel_text
        ):
            feed_url = urljoin(url, href)
            feeds.add(feed_url)
    return feeds


def find_links_on_page(url, base_domain):
    """
    Find all URLs under the same domain from a webpage.
    Returns a set of discovered links.
    """
    links = set()
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except (requests.RequestException, ValueError) as e:
        # If there's an error getting the page, return an empty set
        return links

    soup = BeautifulSoup(response.text, "html.parser")

    # Search for <a> tags
    for a_tag in soup.find_all("a", href=True):
        link = a_tag["href"]
        full_link = urljoin(url, link)
        # Only add links that are within the same domain
        if urlparse(full_link).netloc == base_domain:
            links.add(full_link)
    return links


def crawl_for_feeds(start_url, max_depth):
    """
    Crawl the given site up to `max_depth` pages deep, collecting RSS/Atom feeds.
    """
    base_domain = urlparse(start_url).netloc
    # Each item in the queue is a tuple: (url, depth)
    queue = deque([(start_url, 0)])
    visited = set([start_url])
    feeds_found = set()

    while queue:
        current_url, depth = queue.popleft()

        # Find feeds on the current page
        page_feeds = find_feeds_on_page(current_url)
        feeds_found.update(page_feeds)

        # If we haven't reached the max depth, gather more pages
        if depth < max_depth:
            links = find_links_on_page(current_url, base_domain)
            for link in links:
                if link not in visited:
                    visited.add(link)
                    queue.append((link, depth + 1))
    return feeds_found


def main():
    parser = argparse.ArgumentParser(
        description="Crawl a website for RSS/Atom feeds up to a given depth."
    )
    parser.add_argument("start_url", help="The starting URL to begin crawling.")
    parser.add_argument(
        "--max_depth",
        type=int,
        default=1,
        help="How many pages deep to crawl from the original page (default is 1)."
    )
    args = parser.parse_args()

    feeds = crawl_for_feeds(args.start_url, args.max_depth)
    if feeds:
        print("Found the following feeds:")
        for feed in feeds:
            print(f" - {feed}")
    else:
        print("No feeds found.")


if __name__ == "__main__":
    main()
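
For reference, the <link>-scanning rule used in find_feeds_on_page can be exercised on a small inline snippet. This is a minimal sketch, assuming BeautifulSoup is installed; the HTML markup and the example.com base URL are made up purely for illustration, not taken from the script:

    from bs4 import BeautifulSoup
    from urllib.parse import urljoin

    # Hypothetical page markup, used only to show which <link> tags match
    html = (
        '<html><head>'
        '<link rel="alternate" type="application/rss+xml" href="/feed.xml">'
        '<link rel="stylesheet" href="/style.css">'
        '</head></html>'
    )

    soup = BeautifulSoup(html, "html.parser")
    for link_tag in soup.find_all("link"):
        link_type = (link_tag.get("type") or "").lower()
        rel_text = " ".join(link_tag.get("rel", [])).lower()  # bs4 returns rel as a list
        href = link_tag.get("href")
        if href and ("rss" in link_type or "atom" in link_type or "feed" in rel_text):
            print(urljoin("https://example.com/", href))  # -> https://example.com/feed.xml

Only the first <link> matches, because its type contains "rss"; the stylesheet link is ignored.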

n3k (41 lines changed)

@@ -3,17 +3,33 @@
import sys
import asyncio
import trafilatura
import requests
from newspaper import Article
from urllib.parse import urlparse
from datetime import datetime
import math
from typing import Optional
import textwrap
import re

def fetch_url_with_headers(url: str) -> Optional[str]:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/115.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.google.com/'
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Error fetching URL with headers: {str(e)}", file=sys.stderr)
        return None

async def fetch_and_parse_article(url: str):
    # Try trafilatura first
    source = trafilatura.fetch_url(url)
    # First try using trafilatura with custom headers
    source = fetch_url_with_headers(url)
    if source:
        try:
@@ -23,12 +39,12 @@ async def fetch_and_parse_article(url: str):
            article.set_html(source)
            article.parse()
            # Extract text with trafilatura but without markdown to avoid its line breaking
            # Extract text with trafilatura without markdown to avoid its line breaking
            raw_text = trafilatura.extract(source, output_format="text", include_comments=False)
            if raw_text:
                article.text = raw_text
            # Update other article properties with trafilatura data
            # Update article properties using trafilatura data
            article.title = article.title or traf.title or url
            article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
            article.publish_date = traf.date or datetime.now()
@@ -38,13 +54,16 @@ async def fetch_and_parse_article(url: str):
            return article
        except Exception as e:
            print(f"Trafilatura extraction failed: {str(e)}", file=sys.stderr)
            pass

    # Fallback to newspaper3k
    # Fallback to newspaper3k with headers
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/115.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.google.com/'
        }
        article = Article(url)
@@ -86,10 +105,7 @@ def format_article_markdown(article) -> str:
    # Format article text with improved paragraph handling
    if article.text:
        # Clean and normalize the text first
        clean_content = clean_text(article.text)
        # Just split on double newlines and don't use textwrap
        output += clean_content

    return output
@@ -110,3 +126,4 @@ async def main():
if __name__ == "__main__":
    asyncio.run(main())
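
Taken together, the n3k change fetches the HTML once with browser-like headers and then hands it to both newspaper3k and trafilatura, preferring trafilatura's plain-text extraction when it returns something. A minimal sketch of that combined flow, assuming requests, newspaper3k, and trafilatura are installed; the URL and the trimmed header dict below are placeholders, not the script's exact values, and the trafilatura call simply mirrors the one in the diff:

    import requests
    import trafilatura
    from newspaper import Article

    url = "https://example.com/some-article"  # placeholder URL
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}  # trimmed for brevity

    html = requests.get(url, headers=headers, timeout=10).text

    article = Article(url)
    article.set_html(html)   # reuse the already-fetched HTML instead of article.download()
    article.parse()

    # Prefer trafilatura's text output when available (same call as in n3k)
    text = trafilatura.extract(html, output_format="text", include_comments=False)
    if text:
        article.text = text

    print(article.title)
    print(article.text[:200])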

shredr (new executable file, 80 lines added)

@@ -0,0 +1,80 @@
#!/bin/bash
# -------------------------------------------------
# shredr
# Description:
#   This script securely deletes a file or all files within a directory.
# Usage:
#   ./shredr /path/to/file_or_directory
# -------------------------------------------------

# Function to display usage information
usage() {
    echo "Usage: $0 /path/to/file_or_directory"
    exit 1
}

# Check if exactly one argument is provided
if [ "$#" -ne 1 ]; then
    echo "Error: Exactly one path must be provided."
    usage
fi

TARGET_PATH="$1"

# Check if the path exists
if [ ! -e "$TARGET_PATH" ]; then
    echo "Error: The path '$TARGET_PATH' does not exist."
    exit 1
fi

# Function to shred a single file with specified options
shred_file() {
    local file="$1"
    echo "Shredding file: $file"
    shred -f -v -z -n 9 "$file"
    if [ $? -eq 0 ]; then
        echo "Successfully shredded: $file"
    else
        echo "Failed to shred: $file" >&2
    fi
}

# Determine if the path is a file or directory
if [ -f "$TARGET_PATH" ]; then
    # It's a regular file
    shred_file "$TARGET_PATH"
elif [ -d "$TARGET_PATH" ]; then
    # It's a directory
    echo "Detected directory: $TARGET_PATH"
    echo "Shredding all files within the directory..."

    # Find and shred all regular files within the directory
    find "$TARGET_PATH" -type f -print0 | while IFS= read -r -d '' file; do
        shred_file "$file"
    done

    echo "All files within '$TARGET_PATH' have been shredded."

    # Remove the now-empty directory structure
    echo "Removing directory: $TARGET_PATH"
    rm -rf "$TARGET_PATH"
    if [ $? -eq 0 ]; then
        echo "Directory '$TARGET_PATH' has been removed."
    else
        echo "Failed to remove directory: $TARGET_PATH" >&2
    fi
else
    # Neither a regular file nor a directory
    echo "Error: The path '$TARGET_PATH' is neither a regular file nor a directory."
    exit 1
fi

echo "Secure deletion completed successfully."
exit 0