#!/usr/bin/env python3
"""Fetch a news-article URL and print it as Markdown.

Extraction strategy: try trafilatura first (better text quality), fall
back to a plain newspaper3k download/parse when trafilatura cannot fetch
or process the page.
"""
import sys
import asyncio
import math
import re
import textwrap
from datetime import datetime
from typing import Optional
from urllib.parse import urlparse

import trafilatura
from newspaper import Article


async def fetch_and_parse_article(url: str):
    """Download and parse the article at *url*.

    Returns a newspaper ``Article`` whose fields are enriched with
    trafilatura metadata when available.

    Raises:
        RuntimeError: when both extraction strategies fail.

    NOTE(review): despite the ``async`` signature, every network call in
    here is blocking; awaiting this coroutine yields no concurrency.
    The ``async`` interface is kept for caller compatibility.
    """
    # --- Primary path: trafilatura fetch + metadata, newspaper parse ---
    source = trafilatura.fetch_url(url)
    if source:
        try:
            # extract_metadata may return None for unparseable documents;
            # guard below instead of letting AttributeError discard a
            # successfully parsed article.
            traf = trafilatura.extract_metadata(filecontent=source, default_url=url)
            article = Article(url)
            article.set_html(source)
            article.parse()

            # Prefer trafilatura's plain-text body: plain "text" output
            # avoids the hard line-wrapping its markdown mode introduces.
            raw_text = trafilatura.extract(
                source, output_format="text", include_comments=False
            )
            if raw_text:
                article.text = raw_text

            if traf is not None:
                # Fill gaps in newspaper's results with trafilatura data.
                article.title = article.title or traf.title or url
                if not article.authors:
                    if isinstance(traf.author, list):
                        article.authors = traf.author
                    elif traf.author:
                        # Single author string -> one-element list; a falsy
                        # author no longer produces [None].
                        article.authors = [traf.author]
                # trafilatura's date is preferred; fall back to "now" so
                # downstream formatting always has something to render.
                article.publish_date = traf.date or datetime.now()
                article.top_image = article.top_image or traf.image
                article.source_url = (
                    traf.sitename
                    or urlparse(url).netloc.replace('www.', '').title()
                )
            else:
                # Metadata unavailable: keep the parsed article anyway.
                article.publish_date = article.publish_date or datetime.now()
                article.source_url = urlparse(url).netloc.replace('www.', '').title()
            return article
        except Exception as e:
            # Best-effort: report and fall through to the newspaper3k path.
            print(f"Trafilatura extraction failed: {str(e)}", file=sys.stderr)

    # --- Fallback: plain newspaper3k download + parse ---
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        article = Article(url)
        article.config.browser_user_agent = headers['User-Agent']
        article.config.headers = headers
        article.download()
        article.parse()
        article.source_url = urlparse(url).netloc.replace('www.', '').title()
        return article
    except Exception as e:
        raise RuntimeError(f"Failed to parse article from {url}: {str(e)}") from e


def clean_text(text: str) -> str:
    """Normalize whitespace while preserving paragraph breaks."""
    # Protect paragraph breaks (blank lines) with a sentinel that is
    # unlikely to occur in article text.
    text = re.sub(r'\n\s*\n', '¶¶', text)
    # Collapse all remaining whitespace (incl. single newlines) to spaces.
    text = re.sub(r'\s+', ' ', text)
    # Restore paragraph breaks as exactly one blank line.
    text = text.replace('¶¶', '\n\n')
    return text.strip()


def format_article_markdown(article) -> str:
    """Render a parsed article as a Markdown document string."""
    # Title
    output = f"# {article.title}\n\n"

    # Byline
    if article.authors:
        authors = article.authors if isinstance(article.authors, list) else [article.authors]
        output += f"*By {', '.join(filter(None, authors))}*\n\n"

    # Publish date: datetime objects get ISO-style formatting, anything
    # else (e.g. a pre-formatted string) is used verbatim.
    if article.publish_date:
        date_str = (
            article.publish_date.strftime("%Y-%m-%d")
            if isinstance(article.publish_date, datetime)
            else str(article.publish_date)
        )
        output += f"*Published: {date_str}*\n\n"

    # Lead image
    if article.top_image:
        output += f"![Article Image]({article.top_image})\n\n"

    # Body: normalize whitespace but keep paragraph structure; no
    # textwrap — terminals/renderers handle their own wrapping.
    if article.text:
        output += clean_text(article.text)

    return output


async def main():
    """CLI entry point: fetch the URL given as argv[1] and print Markdown."""
    if len(sys.argv) != 2:
        print("Usage: ./n3k <article_url>")
        sys.exit(1)

    url = sys.argv[1]
    try:
        article = await fetch_and_parse_article(url)
        formatted_content = format_article_markdown(article)
        print(formatted_content)
    except Exception as e:
        print(f"Error processing article: {str(e)}")
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())