pathScripts/n3k

112 lines
3.8 KiB
Python
Executable file

#!/usr/bin/env python3
import sys
import asyncio
import trafilatura
from newspaper import Article
from urllib.parse import urlparse
from datetime import datetime
import math
from typing import Optional
import textwrap
import re
async def fetch_and_parse_article(url: str):
    """Fetch *url* and return a parsed ``newspaper.Article``.

    Strategy: try trafilatura first (better body-text extraction),
    merging its metadata into the Article; on any failure fall back to
    plain newspaper3k with a browser-like User-Agent.

    Args:
        url: Absolute URL of the article to fetch.

    Returns:
        A populated ``newspaper.Article`` instance.

    Raises:
        Exception: if both extraction strategies fail.

    NOTE(review): declared ``async`` but performs only blocking calls —
    kept for interface compatibility with ``main()``.
    """
    # --- Primary path: trafilatura fetch + extraction ---------------------
    source = trafilatura.fetch_url(url)
    if source:
        try:
            traf = trafilatura.extract_metadata(filecontent=source, default_url=url)
            article = Article(url)
            article.set_html(source)
            article.parse()
            # Plain-text output (not markdown) avoids trafilatura's own
            # hard line breaking inside paragraphs.
            raw_text = trafilatura.extract(source, output_format="text", include_comments=False)
            if raw_text:
                article.text = raw_text
            # extract_metadata may return None; guard so a metadata miss
            # doesn't throw away an otherwise-successful extraction.
            if traf is not None:
                article.title = article.title or traf.title or url
                article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
                article.publish_date = traf.date or datetime.now()
                article.top_image = article.top_image or traf.image
                article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title()
            else:
                article.title = article.title or url
                article.publish_date = article.publish_date or datetime.now()
                article.source_url = urlparse(url).netloc.replace('www.', '').title()
            return article
        except Exception as e:
            # Best-effort: log and fall through to the newspaper3k path.
            print(f"Trafilatura extraction failed: {str(e)}", file=sys.stderr)
    # --- Fallback path: newspaper3k -------------------------------------
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        article = Article(url)
        article.config.browser_user_agent = headers['User-Agent']
        article.config.headers = headers
        article.download()
        article.parse()
        # Pretty source name, e.g. "example.com" -> "Example.Com".
        article.source_url = urlparse(url).netloc.replace('www.', '').title()
        return article
    except Exception as e:
        # Chain the cause so the original traceback isn't lost.
        raise Exception(f"Failed to parse article from {url}: {str(e)}") from e
def clean_text(text: str) -> str:
    """Normalize whitespace: collapse all runs of whitespace inside a
    paragraph to single spaces while preserving blank-line paragraph
    breaks as exactly one empty line. Leading/trailing space is stripped.
    """
    # Protect paragraph boundaries with a sentinel before collapsing,
    # so the global whitespace squeeze cannot eat them.
    marked = re.sub(r'\n\s*\n', '¶¶', text)
    collapsed = re.sub(r'\s+', ' ', marked)
    # Restore each sentinel as a canonical double newline.
    return collapsed.replace('¶¶', '\n\n').strip()
def format_article_markdown(article) -> str:
    """Render a parsed article object as a Markdown document.

    Emits, in order: H1 title, optional byline, optional publish date,
    optional lead image, then the whitespace-normalized body text.
    """
    parts = [f"# {article.title}\n\n"]

    # Byline: tolerate either a list of authors or a single string;
    # falsy entries (None, "") are dropped from the join.
    authors = article.authors
    if authors:
        if not isinstance(authors, list):
            authors = [authors]
        parts.append(f"*By {', '.join(filter(None, authors))}*\n\n")

    # Publish date: datetime objects get YYYY-MM-DD; anything else is
    # rendered verbatim via str().
    published = article.publish_date
    if published:
        if isinstance(published, datetime):
            date_str = published.strftime("%Y-%m-%d")
        else:
            date_str = str(published)
        parts.append(f"*Published: {date_str}*\n\n")

    if article.top_image:
        parts.append(f"![Article Image]({article.top_image})\n\n")

    # Body: normalize whitespace but keep paragraph breaks intact.
    if article.text:
        parts.append(clean_text(article.text))

    return "".join(parts)
async def main():
    """CLI entry point: fetch the URL given as argv[1], print it as Markdown.

    Exits with status 1 on bad usage or any processing failure.
    """
    if len(sys.argv) != 2:
        print("Usage: ./n3k <article_url>")
        sys.exit(1)

    url = sys.argv[1]
    try:
        article = await fetch_and_parse_article(url)
        print(format_article_markdown(article))
    except Exception as e:
        print(f"Error processing article: {str(e)}")
        sys.exit(1)
# Script entry point: drive the async main() on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())