Auto-update: Fri Mar 28 09:42:10 PDT 2025
parent 0c0fd27fe7
commit cfc746242e
3 changed files with 243 additions and 12 deletions
134 findRSS (Executable file)
@@ -0,0 +1,134 @@
#!/usr/bin/env python3

"""
Usage:
    findRSS <start_url> --max_depth <depth>

Example:
    findRSS https://example.com --max_depth 2
"""

import argparse
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from collections import deque

def find_feeds_on_page(url):
    """
    Scan a single webpage for RSS/Atom link tags.
    Returns a set of discovered feed URLs.
    """
    feeds = set()

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except (requests.RequestException, ValueError) as e:
        print(f"Failed to fetch {url}: {e}")
        return feeds

    soup = BeautifulSoup(response.text, "html.parser")

    # Search for <link> tags with known feed types
    for link_tag in soup.find_all("link"):
        link_type = link_tag.get("type", "")
        # rel can be None, string, or a list of strings
        rel_attrs = link_tag.get("rel", [])

        # Convert rel attributes to a single string for "feed" checks
        if isinstance(rel_attrs, list):
            rel_text = " ".join(rel_attrs).lower()
        else:
            rel_text = str(rel_attrs).lower()

        href = link_tag.get("href")

        if href and (
            "rss" in link_type.lower()
            or "atom" in link_type.lower()
            or "feed" in rel_text
        ):
            feed_url = urljoin(url, href)
            feeds.add(feed_url)

    return feeds

def find_links_on_page(url, base_domain):
    """
    Find all URLs under the same domain from a webpage.
    Returns a set of discovered links.
    """
    links = set()

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except (requests.RequestException, ValueError) as e:
        # If there's an error getting the page, return an empty set
        return links

    soup = BeautifulSoup(response.text, "html.parser")

    # Search for <a> tags
    for a_tag in soup.find_all("a", href=True):
        link = a_tag["href"]
        full_link = urljoin(url, link)
        # Only add links that are within the same domain
        if urlparse(full_link).netloc == base_domain:
            links.add(full_link)

    return links

def crawl_for_feeds(start_url, max_depth):
    """
    Crawl the given site up to `max_depth` pages deep, collecting RSS/Atom feeds.
    """
    base_domain = urlparse(start_url).netloc

    # Each item in the queue is a tuple: (url, depth)
    queue = deque([(start_url, 0)])
    visited = set([start_url])
    feeds_found = set()

    while queue:
        current_url, depth = queue.popleft()

        # Find feeds on the current page
        page_feeds = find_feeds_on_page(current_url)
        feeds_found.update(page_feeds)

        # If we haven't reached the max depth, gather more pages
        if depth < max_depth:
            links = find_links_on_page(current_url, base_domain)
            for link in links:
                if link not in visited:
                    visited.add(link)
                    queue.append((link, depth + 1))

    return feeds_found

def main():
    parser = argparse.ArgumentParser(
        description="Crawl a website for RSS/Atom feeds up to a given depth."
    )
    parser.add_argument("start_url", help="The starting URL to begin crawling.")
    parser.add_argument(
        "--max_depth",
        type=int,
        default=1,
        help="How many pages deep to crawl from the original page (default is 1)."
    )
    args = parser.parse_args()

    feeds = crawl_for_feeds(args.start_url, args.max_depth)
    if feeds:
        print("Found the following feeds:")
        for feed in feeds:
            print(f" - {feed}")
    else:
        print("No feeds found.")

if __name__ == "__main__":
    main()
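The crawler can also be exercised programmatically rather than through the CLI. A minimal sketch, assuming the script is importable (for example saved or symlinked as findrss.py on the Python path; that module name is an assumption, since the committed file has no .py extension):

# Sketch only: programmatic use of crawl_for_feeds from the script above.
from findrss import crawl_for_feeds  # assumed import name

feeds = crawl_for_feeds("https://example.com", max_depth=1)
for feed in sorted(feeds):
    print(feed)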
41 n3k
@@ -3,17 +3,33 @@
import sys
import asyncio
import trafilatura
import requests
from newspaper import Article
from urllib.parse import urlparse
from datetime import datetime
import math
from typing import Optional
import textwrap
import re

def fetch_url_with_headers(url: str) -> Optional[str]:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/115.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.google.com/'
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Error fetching URL with headers: {str(e)}", file=sys.stderr)
        return None

async def fetch_and_parse_article(url: str):
    # Try trafilatura first
    source = trafilatura.fetch_url(url)
    # First try using trafilatura with custom headers
    source = fetch_url_with_headers(url)

    if source:
        try:
@@ -23,12 +39,12 @@ async def fetch_and_parse_article(url: str):
            article.set_html(source)
            article.parse()

            # Extract text with trafilatura but without markdown to avoid its line breaking
            # Extract text with trafilatura without markdown to avoid its line breaking
            raw_text = trafilatura.extract(source, output_format="text", include_comments=False)
            if raw_text:
                article.text = raw_text

            # Update other article properties with trafilatura data
            # Update article properties using trafilatura data
            article.title = article.title or traf.title or url
            article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
            article.publish_date = traf.date or datetime.now()
@@ -38,13 +54,16 @@ async def fetch_and_parse_article(url: str):
            return article
        except Exception as e:
            print(f"Trafilatura extraction failed: {str(e)}", file=sys.stderr)
            pass

    # Fallback to newspaper3k
    # Fallback to newspaper3k with headers
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/115.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.google.com/'
        }

        article = Article(url)
@@ -86,10 +105,7 @@ def format_article_markdown(article) -> str:

    # Format article text with improved paragraph handling
    if article.text:
        # Clean and normalize the text first
        clean_content = clean_text(article.text)

        # Just split on double newlines and don't use textwrap
        output += clean_content

    return output
@@ -110,3 +126,4 @@ async def main():

if __name__ == "__main__":
    asyncio.run(main())
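The change above swaps trafilatura.fetch_url for a requests call that sends browser-like headers, and only then hands the HTML to trafilatura for extraction. A minimal sketch of that fetch-then-extract path, assuming n3k is importable under that name (an assumption, since the committed file has no .py extension) and using a hypothetical article URL:

# Sketch only: mirrors the new fetch path used in fetch_and_parse_article.
import trafilatura
from n3k import fetch_url_with_headers  # assumed import name

html = fetch_url_with_headers("https://example.com/some-article")  # hypothetical URL
if html:
    text = trafilatura.extract(html, output_format="text", include_comments=False)
    print(text or "No text extracted.")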
80 shredr (Executable file)
@@ -0,0 +1,80 @@
#!/bin/bash

# -------------------------------------------------
# secure_delete.sh
# Description:
#   This script securely deletes a file or all files within a directory.
# Usage:
#   ./secure_delete.sh /path/to/file_or_directory
# -------------------------------------------------

# Function to display usage information
usage() {
    echo "Usage: $0 /path/to/file_or_directory"
    exit 1
}

# Check if exactly one argument is provided
if [ "$#" -ne 1 ]; then
    echo "Error: Exactly one path must be provided."
    usage
fi

TARGET_PATH="$1"

# Check if the path exists
if [ ! -e "$TARGET_PATH" ]; then
    echo "Error: The path '$TARGET_PATH' does not exist."
    exit 1
fi

# Function to shred a single file with specified options
shred_file() {
    local file="$1"
    echo "Shredding file: $file"
    shred -f -v -z -n 9 "$file"

    if [ $? -eq 0 ]; then
        echo "Successfully shredded: $file"
    else
        echo "Failed to shred: $file" >&2
    fi
}

# Determine if the path is a file or directory
if [ -f "$TARGET_PATH" ]; then
    # It's a regular file
    shred_file "$TARGET_PATH"

elif [ -d "$TARGET_PATH" ]; then
    # It's a directory
    echo "Detected directory: $TARGET_PATH"
    echo "Shredding all files within the directory..."

    # Find and shred all regular files within the directory
    find "$TARGET_PATH" -type f -print0 | while IFS= read -r -d '' file; do
        shred_file "$file"
    done

    echo "All files within '$TARGET_PATH' have been shredded."

    # Remove the now-empty directory structure
    echo "Removing directory: $TARGET_PATH"
    rm -rf "$TARGET_PATH"

    if [ $? -eq 0 ]; then
        echo "Directory '$TARGET_PATH' has been removed."
    else
        echo "Failed to remove directory: $TARGET_PATH" >&2
    fi

else
    # Neither a regular file nor a directory
    echo "Error: The path '$TARGET_PATH' is neither a regular file nor a directory."
    exit 1
fi

echo "Secure deletion completed successfully."

exit 0
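Since the other tools in this commit are Python, callers may want to drive shredr from Python as well. A minimal subprocess sketch, assuming the script is executable and reachable at ./shredr (the target path below is hypothetical):

# Sketch only: invoking the shredr script from Python.
import subprocess

result = subprocess.run(["./shredr", "/tmp/scratch-dir"], check=False)  # hypothetical path
print("shredr exit code:", result.returncode)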