Scripts/n3k

#!/usr/bin/env python3
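"""Fetch an article from a URL and print it as Markdown.

Strategy: download the page with browser-like headers, parse it with
newspaper3k, and prefer trafilatura's text and metadata where available;
if that path fails, fall back to letting newspaper3k download and parse
the page itself.
"""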
import sys
import asyncio
import trafilatura
import requests
from newspaper import Article
from urllib.parse import urlparse
from datetime import datetime
from typing import Optional
import re


def fetch_url_with_headers(url: str) -> Optional[str]:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/115.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.google.com/'
    }
    try:
        # Timeout added so a stalled server cannot hang the script indefinitely
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Error fetching URL with headers: {str(e)}", file=sys.stderr)
        return None


async def fetch_and_parse_article(url: str):
    # First try trafilatura with custom headers
    source = fetch_url_with_headers(url)
    if source:
        try:
            traf = trafilatura.extract_metadata(filecontent=source, default_url=url)
            article = Article(url)
            article.set_html(source)
            article.parse()

            # Extract text with trafilatura as plain text to avoid its line breaking
            raw_text = trafilatura.extract(source, output_format="text", include_comments=False)
            if raw_text:
                article.text = raw_text

            # Fill in missing article properties from trafilatura metadata;
            # extract_metadata can return None, so guard before dereferencing
            if traf:
                article.title = article.title or traf.title or url
                article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
                article.publish_date = traf.date or datetime.now()
                article.top_image = article.top_image or traf.image
                article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title()

            return article
        except Exception as e:
            print(f"Trafilatura extraction failed: {str(e)}", file=sys.stderr)

    # Fall back to newspaper3k with the same headers
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/115.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.google.com/'
        }
        article = Article(url)
        article.config.browser_user_agent = headers['User-Agent']
        article.config.headers = headers
        article.download()
        article.parse()
        article.source_url = urlparse(url).netloc.replace('www.', '').title()
        return article
    except Exception as e:
        raise Exception(f"Failed to parse article from {url}: {str(e)}") from e


def clean_text(text: str) -> str:
    # Convert paragraph breaks to a special marker
    text = re.sub(r'\n\s*\n', '¶¶', text)
    # Collapse all other whitespace (including single newlines) to single spaces
    text = re.sub(r'\s+', ' ', text)
    # Convert markers back to double newlines
    text = text.replace('¶¶', '\n\n')
    return text.strip()
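
# For example, clean_text("line one\nline two\n\nnext para") returns
# "line one line two\n\nnext para": single newlines collapse to spaces
# while paragraph breaks survive as blank lines.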


def format_article_markdown(article) -> str:
    # Title
    output = f"# {article.title}\n\n"

    # Metadata
    if article.authors:
        authors = article.authors if isinstance(article.authors, list) else [article.authors]
        output += f"*By {', '.join(filter(None, authors))}*\n\n"
    if article.publish_date:
        date_str = article.publish_date.strftime("%Y-%m-%d") if isinstance(article.publish_date, datetime) else str(article.publish_date)
        output += f"*Published: {date_str}*\n\n"
    if article.top_image:
        output += f"![Article Image]({article.top_image})\n\n"

    # Article body with normalized paragraph breaks
    if article.text:
        output += clean_text(article.text)

    return output


async def main():
    if len(sys.argv) != 2:
        print("Usage: ./n3k <article_url>")
        sys.exit(1)

    url = sys.argv[1]
    try:
        article = await fetch_and_parse_article(url)
        print(format_article_markdown(article))
    except Exception as e:
        # Errors go to stderr, consistent with the rest of the script
        print(f"Error processing article: {str(e)}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())
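
# Example invocation (hypothetical URL), redirecting the Markdown to a file:
#   ./n3k https://example.com/some-article > article.md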