Auto-update: Tue Feb 11 14:31:09 PST 2025

sanj 2025-02-11 14:31:09 -08:00
parent 34b8a95623
commit 758b8ef136

n3k (85 lines changed)

```diff
@@ -1,32 +1,81 @@
 #!/usr/bin/env python3
 import sys
+import asyncio
+import trafilatura
 from newspaper import Article
+from urllib.parse import urlparse
+from datetime import datetime
+import math
+from typing import Optional
 import textwrap
 
-def format_article(url):
-    try:
-        # Initialize and download the article
-        article = Article(url)
-        article.download()
-        article.parse()
-
-        # Format the title
-        output = f"# {article.title}\n\n"
-
-        # Format the authors
-        if article.authors:
-            output += f"*{', '.join(article.authors)}*\n\n"
-
-        # Format the text with proper wrapping and paragraph separation
-        if article.text:
-            # Split into paragraphs and wrap each one
-            paragraphs = article.text.split('\n')
-            wrapped_paragraphs = []
-
-            for paragraph in paragraphs:
-                if paragraph.strip():  # Only process non-empty paragraphs
-                    # Wrap text at 80 characters
-                    wrapped = textwrap.fill(paragraph.strip(), width=80)
-                    wrapped_paragraphs.append(wrapped)
+async def fetch_and_parse_article(url: str):
+    # Try trafilatura first
+    source = trafilatura.fetch_url(url)
+    if source:
+        try:
+            traf = trafilatura.extract_metadata(filecontent=source, default_url=url)
+            article = Article(url)
+            article.set_html(source)
+            article.parse()
+
+            # Update article properties with trafilatura data
+            article.title = article.title or traf.title or url
+            article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
+            article.publish_date = traf.date or datetime.now()
+            article.text = trafilatura.extract(source, output_format="markdown", include_comments=False) or article.text
+            article.top_image = article.top_image or traf.image
+            article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title()
+            return article
+        except Exception:
+            pass
+
+    # Fallback to newspaper3k
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        }
+        article = Article(url)
+        article.config.browser_user_agent = headers['User-Agent']
+        article.config.headers = headers
+        article.download()
+        article.parse()
+        article.source_url = urlparse(url).netloc.replace('www.', '').title()
+        return article
+    except Exception as e:
+        raise Exception(f"Failed to parse article from {url}: {str(e)}")
+
+def format_article_markdown(article) -> str:
+    # Format title
+    output = f"# {article.title}\n\n"
+
+    # Format metadata
+    if article.authors:
+        authors = article.authors if isinstance(article.authors, list) else [article.authors]
+        output += f"*By {', '.join(filter(None, authors))}*\n\n"
+
+    if article.publish_date:
+        date_str = article.publish_date.strftime("%Y-%m-%d") if isinstance(article.publish_date, datetime) else str(article.publish_date)
+        output += f"*Published: {date_str}*\n\n"
+
+    if article.top_image:
+        output += f"![Article Image]({article.top_image})\n\n"
+
+    # Format article text with proper wrapping
+    if article.text:
+        paragraphs = article.text.split('\n')
+        wrapped_paragraphs = []
+
+        for paragraph in paragraphs:
+            if paragraph.strip():
+                wrapped = textwrap.fill(paragraph.strip(), width=80)
+                wrapped_paragraphs.append(wrapped)
```
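The heart of this hunk is the precedence chain that merges newspaper3k's parse results with trafilatura's metadata: each `or` keeps its first truthy operand, so a field newspaper3k filled in wins, trafilatura's metadata is the fallback, and a hard default (the URL, the current time) comes last. A minimal sketch of the idiom, using hypothetical stand-in values rather than real parser output:

```python
# Sketch of the field-merge idiom from fetch_and_parse_article.
# np_title / traf_title / traf_author are hypothetical stand-ins.
np_title = None                     # newspaper3k found no title
traf_title = "Example Headline"     # trafilatura's metadata did
url = "https://example.com/story"

# `or` returns its first truthy operand, so None and "" fall through
title = np_title or traf_title or url
assert title == "Example Headline"

# Authors are normalized to a list first, since trafilatura may hand
# back either a single string or a list of names:
traf_author = "Jane Doe"
authors = [] or (traf_author if isinstance(traf_author, list) else [traf_author])
assert authors == ["Jane Doe"]
```

One caveat the diff inherits: because `or` treats empty strings and empty lists as missing, a legitimately empty field from newspaper3k is silently replaced by trafilatura's value, which appears to be the intended behavior here.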
```diff
@@ -34,17 +83,19 @@ def format_article(url)
-        return output
-    except Exception as e:
-        return f"Error processing article: {str(e)}"
+    return output
 
-def main():
+async def main():
     if len(sys.argv) != 2:
-        print("Usage: ./script.py <article_url>")
+        print("Usage: ./n3k <article_url>")
         sys.exit(1)
 
     url = sys.argv[1]
-    formatted_article = format_article(url)
-    print(formatted_article)
+    try:
+        article = await fetch_and_parse_article(url)
+        formatted_content = format_article_markdown(article)
+        print(formatted_content)
+    except Exception as e:
+        print(f"Error processing article: {str(e)}")
+        sys.exit(1)
 
 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
```
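The rewritten script is invoked as `./n3k <article_url>`. New line 82 falls between the two hunks and is not shown; presumably it joins `wrapped_paragraphs` into `output`. Assuming that, `format_article_markdown` can be exercised without network access by handing it a stub carrying just the attributes it reads; the stub below is a hypothetical stand-in, not a real `newspaper.Article`:

```python
from datetime import datetime
from types import SimpleNamespace

# Hypothetical stand-in for a parsed article; only the attributes that
# format_article_markdown reads are supplied.
stub = SimpleNamespace(
    title="Example Headline",
    authors=["Jane Doe"],
    publish_date=datetime(2025, 2, 11),
    top_image="https://example.com/lead.jpg",
    text="First paragraph.\nSecond paragraph.",
)

print(format_article_markdown(stub))
# Expected shape of the output:
# # Example Headline
#
# *By Jane Doe*
#
# *Published: 2025-02-11*
#
# ![Article Image](https://example.com/lead.jpg)
#
# ...wrapped body text...
```

One design point worth noting: `fetch_and_parse_article` is declared `async` but never awaits anything, since `trafilatura.fetch_url` and `Article.download` are synchronous calls; `asyncio.run(main())` therefore changes the entry point without adding any actual concurrency.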