Auto-update: Fri Mar 28 09:42:10 PDT 2025

sanj 2025-03-28 09:42:10 -07:00
parent 0c0fd27fe7
commit cfc746242e
3 changed files with 243 additions and 12 deletions

findRSS (new executable file, 134 lines added)

@@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""
Usage:
    findRSS <start_url> --max_depth <depth>
Example:
    findRSS https://example.com --max_depth 2
"""

import argparse
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from collections import deque


def find_feeds_on_page(url):
    """
    Scan a single webpage for RSS/Atom link tags.
    Returns a set of discovered feed URLs.
    """
    feeds = set()
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except (requests.RequestException, ValueError) as e:
        print(f"Failed to fetch {url}: {e}")
        return feeds

    soup = BeautifulSoup(response.text, "html.parser")

    # Search for <link> tags with known feed types
    for link_tag in soup.find_all("link"):
        link_type = link_tag.get("type", "")
        # rel can be None, a string, or a list of strings
        rel_attrs = link_tag.get("rel", [])
        # Convert rel attributes to a single string for "feed" checks
        if isinstance(rel_attrs, list):
            rel_text = " ".join(rel_attrs).lower()
        else:
            rel_text = str(rel_attrs).lower()
        href = link_tag.get("href")
        if href and (
            "rss" in link_type.lower()
            or "atom" in link_type.lower()
            or "feed" in rel_text
        ):
            feed_url = urljoin(url, href)
            feeds.add(feed_url)
    return feeds


def find_links_on_page(url, base_domain):
    """
    Find all URLs under the same domain from a webpage.
    Returns a set of discovered links.
    """
    links = set()
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except (requests.RequestException, ValueError) as e:
        # If there's an error getting the page, return an empty set
        return links

    soup = BeautifulSoup(response.text, "html.parser")

    # Search for <a> tags
    for a_tag in soup.find_all("a", href=True):
        link = a_tag["href"]
        full_link = urljoin(url, link)
        # Only add links that are within the same domain
        if urlparse(full_link).netloc == base_domain:
            links.add(full_link)
    return links


def crawl_for_feeds(start_url, max_depth):
    """
    Crawl the given site up to `max_depth` pages deep, collecting RSS/Atom feeds.
    """
    base_domain = urlparse(start_url).netloc
    # Each item in the queue is a tuple: (url, depth)
    queue = deque([(start_url, 0)])
    visited = set([start_url])
    feeds_found = set()

    while queue:
        current_url, depth = queue.popleft()

        # Find feeds on the current page
        page_feeds = find_feeds_on_page(current_url)
        feeds_found.update(page_feeds)

        # If we haven't reached the max depth, gather more pages
        if depth < max_depth:
            links = find_links_on_page(current_url, base_domain)
            for link in links:
                if link not in visited:
                    visited.add(link)
                    queue.append((link, depth + 1))
    return feeds_found


def main():
    parser = argparse.ArgumentParser(
        description="Crawl a website for RSS/Atom feeds up to a given depth."
    )
    parser.add_argument("start_url", help="The starting URL to begin crawling.")
    parser.add_argument(
        "--max_depth",
        type=int,
        default=1,
        help="How many pages deep to crawl from the original page (default is 1)."
    )
    args = parser.parse_args()

    feeds = crawl_for_feeds(args.start_url, args.max_depth)
    if feeds:
        print("Found the following feeds:")
        for feed in feeds:
            print(f" - {feed}")
    else:
        print("No feeds found.")


if __name__ == "__main__":
    main()
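
For reference, the <link>-scanning rule used in find_feeds_on_page can be exercised on a small inline snippet. This is a minimal sketch, assuming BeautifulSoup is installed; the HTML markup and the example.com base URL are made up purely for illustration, not taken from the script:

    from bs4 import BeautifulSoup
    from urllib.parse import urljoin

    # Hypothetical page markup, used only to show which <link> tags match
    html = (
        '<html><head>'
        '<link rel="alternate" type="application/rss+xml" href="/feed.xml">'
        '<link rel="stylesheet" href="/style.css">'
        '</head></html>'
    )

    soup = BeautifulSoup(html, "html.parser")
    for link_tag in soup.find_all("link"):
        link_type = (link_tag.get("type") or "").lower()
        rel_text = " ".join(link_tag.get("rel", [])).lower()  # bs4 returns rel as a list
        href = link_tag.get("href")
        if href and ("rss" in link_type or "atom" in link_type or "feed" in rel_text):
            print(urljoin("https://example.com/", href))  # -> https://example.com/feed.xml

Only the first <link> matches, because its type contains "rss"; the stylesheet link is ignored.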

n3k (41 lines changed)

@@ -3,17 +3,33 @@
import sys
import asyncio
import trafilatura
import requests
from newspaper import Article
from urllib.parse import urlparse
from datetime import datetime
import math
from typing import Optional
import textwrap
import re

def fetch_url_with_headers(url: str) -> Optional[str]:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/115.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.google.com/'
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Error fetching URL with headers: {str(e)}", file=sys.stderr)
        return None

async def fetch_and_parse_article(url: str):
    # Try trafilatura first
    source = trafilatura.fetch_url(url)
    # First try using trafilatura with custom headers
    source = fetch_url_with_headers(url)
    if source:
        try:
@@ -23,12 +39,12 @@ async def fetch_and_parse_article(url: str):
            article.set_html(source)
            article.parse()
            # Extract text with trafilatura but without markdown to avoid its line breaking
            # Extract text with trafilatura without markdown to avoid its line breaking
            raw_text = trafilatura.extract(source, output_format="text", include_comments=False)
            if raw_text:
                article.text = raw_text
            # Update other article properties with trafilatura data
            # Update article properties using trafilatura data
            article.title = article.title or traf.title or url
            article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
            article.publish_date = traf.date or datetime.now()
@@ -38,13 +54,16 @@ async def fetch_and_parse_article(url: str):
            return article
        except Exception as e:
            print(f"Trafilatura extraction failed: {str(e)}", file=sys.stderr)
            pass

    # Fallback to newspaper3k
    # Fallback to newspaper3k with headers
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/115.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.google.com/'
        }
        article = Article(url)
@@ -86,10 +105,7 @@ def format_article_markdown(article) -> str:
    # Format article text with improved paragraph handling
    if article.text:
        # Clean and normalize the text first
        clean_content = clean_text(article.text)
        # Just split on double newlines and don't use textwrap
        output += clean_content

    return output
@@ -110,3 +126,4 @@ async def main():
if __name__ == "__main__":
    asyncio.run(main())
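
Taken together, the n3k change fetches the HTML once with browser-like headers and then hands it to both newspaper3k and trafilatura, preferring trafilatura's plain-text extraction when it returns something. A minimal sketch of that combined flow, assuming requests, newspaper3k, and trafilatura are installed; the URL and the trimmed header dict below are placeholders, not the script's exact values, and the trafilatura call simply mirrors the one in the diff:

    import requests
    import trafilatura
    from newspaper import Article

    url = "https://example.com/some-article"  # placeholder URL
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}  # trimmed for brevity

    html = requests.get(url, headers=headers, timeout=10).text

    article = Article(url)
    article.set_html(html)   # reuse the already-fetched HTML instead of article.download()
    article.parse()

    # Prefer trafilatura's text output when available (same call as in n3k)
    text = trafilatura.extract(html, output_format="text", include_comments=False)
    if text:
        article.text = text

    print(article.title)
    print(article.text[:200])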

shredr (new executable file, 80 lines added)

@@ -0,0 +1,80 @@
#!/bin/bash
# -------------------------------------------------
# shredr
# Description:
#   This script securely deletes a file or all files within a directory.
# Usage:
#   ./shredr /path/to/file_or_directory
# -------------------------------------------------

# Function to display usage information
usage() {
    echo "Usage: $0 /path/to/file_or_directory"
    exit 1
}

# Check if exactly one argument is provided
if [ "$#" -ne 1 ]; then
    echo "Error: Exactly one path must be provided."
    usage
fi

TARGET_PATH="$1"

# Check if the path exists
if [ ! -e "$TARGET_PATH" ]; then
    echo "Error: The path '$TARGET_PATH' does not exist."
    exit 1
fi

# Function to shred a single file with specified options
shred_file() {
    local file="$1"
    echo "Shredding file: $file"
    shred -f -v -z -n 9 "$file"
    if [ $? -eq 0 ]; then
        echo "Successfully shredded: $file"
    else
        echo "Failed to shred: $file" >&2
    fi
}

# Determine if the path is a file or directory
if [ -f "$TARGET_PATH" ]; then
    # It's a regular file
    shred_file "$TARGET_PATH"
elif [ -d "$TARGET_PATH" ]; then
    # It's a directory
    echo "Detected directory: $TARGET_PATH"
    echo "Shredding all files within the directory..."

    # Find and shred all regular files within the directory
    find "$TARGET_PATH" -type f -print0 | while IFS= read -r -d '' file; do
        shred_file "$file"
    done

    echo "All files within '$TARGET_PATH' have been shredded."

    # Remove the now-empty directory structure
    echo "Removing directory: $TARGET_PATH"
    rm -rf "$TARGET_PATH"
    if [ $? -eq 0 ]; then
        echo "Directory '$TARGET_PATH' has been removed."
    else
        echo "Failed to remove directory: $TARGET_PATH" >&2
    fi
else
    # Neither a regular file nor a directory
    echo "Error: The path '$TARGET_PATH' is neither a regular file nor a directory."
    exit 1
fi

echo "Secure deletion completed successfully."
exit 0