#!/usr/bin/env python3
"""Fetch a web article and print it to stdout as Markdown.

Usage: ./n3k <article_url>

The page is fetched with trafilatura and parsed with newspaper; if that path
yields nothing, newspaper downloads the page itself with browser-like headers.
"""
import asyncio
import logging
import math
import sys
import textwrap
from datetime import datetime
from typing import Optional
from urllib.parse import urlparse

import trafilatura
from newspaper import Article

logger = logging.getLogger(__name__)

# Headers sent by the newspaper fallback download.
_FALLBACK_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}


def _site_name(url: str) -> str:
    """Return a display name derived from the URL's host (leading 'www.' dropped, title-cased)."""
    host = urlparse(url).netloc
    # Strip only a leading "www." rather than every occurrence inside the host.
    if host.startswith('www.'):
        host = host[len('www.'):]
    return host.title()


def _parse_with_trafilatura(url: str, source):
    """Build an Article from already-fetched HTML, enriched with trafilatura metadata."""
    traf = trafilatura.extract_metadata(filecontent=source, default_url=url)

    article = Article(url)
    article.set_html(source)
    article.parse()

    # NOTE(review): extract_metadata appears to return None on some inputs /
    # versions; read every field through getattr so a missing metadata object
    # does not force a second download of the same page.
    traf_title = getattr(traf, 'title', None)
    traf_author = getattr(traf, 'author', None)
    traf_date = getattr(traf, 'date', None)
    traf_image = getattr(traf, 'image', None)
    traf_sitename = getattr(traf, 'sitename', None)

    article.title = article.title or traf_title or url

    if not article.authors:
        candidates = traf_author if isinstance(traf_author, list) else [traf_author]
        # Drop missing names so the byline is never built from [None].
        article.authors = [name for name in candidates if name]

    # Prefer trafilatura's date, then the date newspaper parsed; the original
    # "now" fallback is kept last for backward compatibility.
    article.publish_date = traf_date or article.publish_date or datetime.now()

    extracted = trafilatura.extract(source, output_format="markdown", include_comments=False)
    article.text = extracted or article.text
    article.top_image = article.top_image or traf_image
    article.source_url = traf_sitename or _site_name(url)
    return article


def _download_with_newspaper(url: str):
    """Download and parse the page with newspaper, using browser-like headers."""
    article = Article(url)
    article.config.browser_user_agent = _FALLBACK_HEADERS['User-Agent']
    # Hand newspaper its own copy so the module-level constant is never mutated.
    article.config.headers = dict(_FALLBACK_HEADERS)
    article.download()
    article.parse()
    article.source_url = _site_name(url)
    return article


async def fetch_and_parse_article(url: str):
    """Fetch ``url`` and return a parsed newspaper ``Article``.

    The trafilatura path is tried first; if it fetches nothing or fails, the
    page is downloaded and parsed by newspaper alone.

    Args:
        url: Address of the article to fetch.

    Returns:
        A newspaper ``Article`` with title, authors, publish_date, text,
        top_image and source_url populated as far as they could be determined.

    Raises:
        Exception: If the newspaper fallback also fails; the underlying error
            is chained as ``__cause__``.
    """
    loop = asyncio.get_running_loop()

    # The fetch is a synchronous call; run it in the default executor so the
    # event loop is not blocked while waiting on the network.
    source = await loop.run_in_executor(None, trafilatura.fetch_url, url)
    if source:
        try:
            return _parse_with_trafilatura(url, source)
        except Exception:
            # Deliberate best-effort: record why, then fall through to newspaper.
            logger.debug("trafilatura path failed for %s; falling back", url, exc_info=True)

    try:
        return await loop.run_in_executor(None, _download_with_newspaper, url)
    except Exception as e:
        raise Exception(f"Failed to parse article from {url}: {str(e)}") from e


def format_article_markdown(article) -> str:
    """Render an article object as a Markdown document.

    Args:
        article: Object exposing ``title``, ``authors``, ``publish_date``,
            ``top_image`` and ``text`` attributes.

    Returns:
        Markdown text: a title heading, optional byline, publish date and top
        image, followed by the body with each non-empty line wrapped to 80
        columns and separated by blank lines.
    """
    parts = [f"# {article.title}\n\n"]

    authors = article.authors if isinstance(article.authors, list) else [article.authors]
    # Filter before testing so a list of empty names does not print "*By *".
    authors = [name for name in authors if name]
    if authors:
        parts.append(f"*By {', '.join(authors)}*\n\n")

    if article.publish_date:
        if isinstance(article.publish_date, datetime):
            date_str = article.publish_date.strftime("%Y-%m-%d")
        else:
            date_str = str(article.publish_date)
        parts.append(f"*Published: {date_str}*\n\n")

    if article.top_image:
        # Emit the image itself; the previous branch only added blank lines.
        parts.append(f"![Top image]({article.top_image})\n\n")

    if article.text:
        wrapped_paragraphs = [
            textwrap.fill(line.strip(), width=80)
            for line in article.text.split('\n')
            if line.strip()
        ]
        parts.append('\n\n'.join(wrapped_paragraphs))

    return ''.join(parts)


async def main():
    """CLI entry point: read one URL from argv and print the article as Markdown.

    Exits with status 1 on wrong usage or when the article cannot be processed.
    """
    if len(sys.argv) != 2:
        # Diagnostics go to stderr so stdout carries only the Markdown output.
        print("Usage: ./n3k <article_url>", file=sys.stderr)
        sys.exit(1)

    url = sys.argv[1]
    try:
        article = await fetch_and_parse_article(url)
        formatted_content = format_article_markdown(article)
        print(formatted_content)
    except Exception as e:
        print(f"Error processing article: {str(e)}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())