Auto-update: Tue Feb 11 14:31:09 PST 2025

This commit is contained in:
sanj 2025-02-11 14:31:09 -08:00
parent 34b8a95623
commit 758b8ef136

111
n3k
View file

@@ -1,50 +1,101 @@
#!/usr/bin/env python3
import sys
import asyncio
import trafilatura
from newspaper import Article
from urllib.parse import urlparse
from datetime import datetime
import math
from typing import Optional
import textwrap
async def fetch_and_parse_article(url: str):
    """Fetch the page at *url* and return a parsed newspaper ``Article``.

    Strategy 1: download with trafilatura, parse with newspaper, then fill
    any missing fields (title, authors, date, text, image, source) from
    trafilatura's metadata; the body text is re-extracted as Markdown.
    Strategy 2 (fallback): plain newspaper3k download + parse with a
    browser-like User-Agent.

    Raises:
        Exception: if both strategies fail, with the original cause chained.

    NOTE(review): declared ``async`` but every call here is blocking
    (trafilatura.fetch_url, Article.download) — confirm whether callers
    need a real coroutine or the blocking work should move to an executor.
    """
    # --- Strategy 1: trafilatura fetch + metadata, newspaper parse -------
    source = trafilatura.fetch_url(url)
    if source:
        try:
            traf = trafilatura.extract_metadata(filecontent=source, default_url=url)
            article = Article(url)
            article.set_html(source)
            article.parse()
            # Prefer newspaper's parsed fields; fall back to trafilatura
            # metadata so partial parses still yield a usable article.
            article.title = article.title or traf.title or url
            article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
            article.publish_date = traf.date or datetime.now()
            article.text = trafilatura.extract(source, output_format="markdown", include_comments=False) or article.text
            article.top_image = article.top_image or traf.image
            article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title()
            return article
        except Exception:
            # Best-effort: fall through to the newspaper3k-only path.
            pass

    # --- Strategy 2: newspaper3k download + parse ------------------------
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        article = Article(url)
        article.config.browser_user_agent = headers['User-Agent']
        article.config.headers = headers
        article.download()
        article.parse()
        # Derive a display name for the source from the hostname.
        article.source_url = urlparse(url).netloc.replace('www.', '').title()
        return article
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise Exception(f"Failed to parse article from {url}: {str(e)}") from e
def format_article_markdown(article) -> str:
    """Render a parsed article object as a Markdown document string.

    Emits, in order: an H1 title, italic author line, italic publish date,
    a hero image, then the body text with each paragraph wrapped at 80
    columns and paragraphs separated by blank lines. Author/date/image
    sections are skipped when the corresponding attribute is falsy.

    Args:
        article: any object exposing ``title``, ``authors``,
            ``publish_date``, ``top_image`` and ``text`` attributes
            (e.g. a newspaper ``Article``).

    Returns:
        The formatted Markdown text.
    """
    output = f"# {article.title}\n\n"

    # Authors may arrive as a list or a single string; drop empty entries.
    if article.authors:
        authors = article.authors if isinstance(article.authors, list) else [article.authors]
        output += f"*By {', '.join(filter(None, authors))}*\n\n"

    # publish_date may be a datetime or a pre-formatted string.
    if article.publish_date:
        date_str = article.publish_date.strftime("%Y-%m-%d") if isinstance(article.publish_date, datetime) else str(article.publish_date)
        output += f"*Published: {date_str}*\n\n"

    if article.top_image:
        output += f"![Article Image]({article.top_image})\n\n"

    # Wrap each non-empty paragraph at 80 columns.
    if article.text:
        wrapped_paragraphs = [
            textwrap.fill(paragraph.strip(), width=80)
            for paragraph in article.text.split('\n')
            if paragraph.strip()
        ]
        output += '\n\n'.join(wrapped_paragraphs)

    return output
async def main():
    """CLI entry point: fetch the article at ``argv[1]`` and print Markdown.

    Exits with status 1 on bad usage or on any processing error, printing
    a human-readable message instead of a traceback.
    """
    if len(sys.argv) != 2:
        print("Usage: ./n3k <article_url>")
        sys.exit(1)

    url = sys.argv[1]
    try:
        article = await fetch_and_parse_article(url)
        formatted_content = format_article_markdown(article)
        print(formatted_content)
    except Exception as e:
        # Top-level boundary: report and exit non-zero rather than crash.
        print(f"Error processing article: {str(e)}")
        sys.exit(1)
if __name__ == "__main__":
    # main() is a coroutine; asyncio.run drives it to completion.
    asyncio.run(main())