#!/usr/bin/env python3
"""Fetch a news article from a URL and print it as Markdown.

Extraction is attempted with trafilatura first, falling back to newspaper3k.
"""

import asyncio
import re
import sys
from datetime import datetime
from typing import Optional
from urllib.parse import urlparse

import requests
import trafilatura
from newspaper import Article

# Browser-like headers help avoid trivial bot blocking on many news sites.
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/115.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Referer': 'https://www.google.com/',
}


def fetch_url_with_headers(url: str) -> Optional[str]:
    """Download the raw HTML for `url`, returning None on any failure."""
    try:
        response = requests.get(url, headers=DEFAULT_HEADERS, timeout=30)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Error fetching URL with headers: {e}", file=sys.stderr)
        return None


async def fetch_and_parse_article(url: str) -> Article:
    # First try trafilatura on HTML fetched with our custom headers.
    source = fetch_url_with_headers(url)
    if source:
        try:
            traf = trafilatura.extract_metadata(filecontent=source, default_url=url)
            article = Article(url)
            article.set_html(source)
            article.parse()

            # Extract plain text with trafilatura (not markdown) to avoid
            # its hard line-wrapping.
            raw_text = trafilatura.extract(
                source, output_format="text", include_comments=False
            )
            if raw_text:
                article.text = raw_text

            # Fill gaps in the newspaper3k results with trafilatura metadata.
            # extract_metadata() may return None, so guard before using it.
            if traf is not None:
                article.title = article.title or traf.title or url
                article.authors = article.authors or (
                    traf.author if isinstance(traf.author, list)
                    else [traf.author] if traf.author else []
                )
                article.publish_date = article.publish_date or traf.date or datetime.now()
                article.top_image = article.top_image or traf.image
                article.source_url = (
                    traf.sitename
                    or urlparse(url).netloc.replace('www.', '').title()
                )
            else:
                article.title = article.title or url
                article.publish_date = article.publish_date or datetime.now()
                article.source_url = urlparse(url).netloc.replace('www.', '').title()

            return article
        except Exception as e:
            print(f"Trafilatura extraction failed: {e}", file=sys.stderr)

    # Fall back to letting newspaper3k download the page itself.
    try:
        article = Article(url)
        article.config.browser_user_agent = DEFAULT_HEADERS['User-Agent']
        article.config.headers = DEFAULT_HEADERS
        article.download()
        article.parse()
        article.source_url = urlparse(url).netloc.replace('www.', '').title()
        return article
    except Exception as e:
        raise RuntimeError(f"Failed to parse article from {url}: {e}") from e


def clean_text(text: str) -> str:
    """Normalize whitespace while preserving paragraph breaks."""
    # Protect paragraph breaks (blank lines) with a marker...
    text = re.sub(r'\n\s*\n', '¶¶', text)
    # ...collapse all remaining whitespace (including single newlines)...
    text = re.sub(r'\s+', ' ', text)
    # ...then restore the paragraph breaks as double newlines.
    text = text.replace('¶¶', '\n\n')
    return text.strip()


def format_article_markdown(article: Article) -> str:
    """Render the parsed article as a Markdown document."""
    output = f"# {article.title}\n\n"

    if article.authors:
        authors = article.authors if isinstance(article.authors, list) else [article.authors]
        output += f"*By {', '.join(filter(None, authors))}*\n\n"

    if article.publish_date:
        date_str = (
            article.publish_date.strftime("%Y-%m-%d")
            if isinstance(article.publish_date, datetime)
            else str(article.publish_date)
        )
        output += f"*Published: {date_str}*\n\n"

    if article.top_image:
        output += f"![Article Image]({article.top_image})\n\n"

    # Format the article body with normalized paragraph handling.
    if article.text:
        output += clean_text(article.text)

    return output


async def main() -> None:
    if len(sys.argv) != 2:
        print("Usage: ./n3k <article_url>", file=sys.stderr)
        sys.exit(1)

    url = sys.argv[1]
    try:
        article = await fetch_and_parse_article(url)
        print(format_article_markdown(article))
    except Exception as e:
        print(f"Error processing article: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())
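
# Example invocation (a sketch; the URL is a placeholder, and the script is
# assumed to be saved as `n3k` with the executable bit set, matching the
# usage string above):
#
#   ./n3k https://example.com/some-article > article.md
#
# The formatted Markdown is written to stdout, so it can be redirected to a
# file or piped to a pager; errors and fetch diagnostics go to stderr.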