pathScripts/n3k

112 lines
3.8 KiB
Python
Executable file

#!/usr/bin/env python3
import sys
import asyncio
import trafilatura
from newspaper import Article
from urllib.parse import urlparse
from datetime import datetime
import math
from typing import Optional
import textwrap
import re
async def fetch_and_parse_article(url: str):
    """Fetch *url* and return a parsed ``newspaper.Article``.

    Strategy: try trafilatura first (better body-text extraction),
    merging its metadata into the Article; on any failure fall back to
    plain newspaper3k with a browser-like User-Agent.

    Args:
        url: Absolute URL of the article to fetch.

    Returns:
        A populated ``newspaper.Article`` instance.

    Raises:
        Exception: if both extraction strategies fail.

    NOTE(review): declared ``async`` but performs only blocking calls —
    kept for interface compatibility with ``main()``.
    """
    # --- Primary path: trafilatura fetch + extraction ---------------------
    source = trafilatura.fetch_url(url)
    if source:
        try:
            traf = trafilatura.extract_metadata(filecontent=source, default_url=url)
            article = Article(url)
            article.set_html(source)
            article.parse()
            # Plain-text output (not markdown) avoids trafilatura's own
            # hard line breaking inside paragraphs.
            raw_text = trafilatura.extract(source, output_format="text", include_comments=False)
            if raw_text:
                article.text = raw_text
            # extract_metadata may return None; guard so a metadata miss
            # doesn't throw away an otherwise-successful extraction.
            if traf is not None:
                article.title = article.title or traf.title or url
                article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
                article.publish_date = traf.date or datetime.now()
                article.top_image = article.top_image or traf.image
                article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title()
            else:
                article.title = article.title or url
                article.publish_date = article.publish_date or datetime.now()
                article.source_url = urlparse(url).netloc.replace('www.', '').title()
            return article
        except Exception as e:
            # Best-effort: log and fall through to the newspaper3k path.
            print(f"Trafilatura extraction failed: {str(e)}", file=sys.stderr)
    # --- Fallback path: newspaper3k -------------------------------------
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        article = Article(url)
        article.config.browser_user_agent = headers['User-Agent']
        article.config.headers = headers
        article.download()
        article.parse()
        # Pretty source name, e.g. "example.com" -> "Example.Com".
        article.source_url = urlparse(url).netloc.replace('www.', '').title()
        return article
    except Exception as e:
        # Chain the cause so the original traceback isn't lost.
        raise Exception(f"Failed to parse article from {url}: {str(e)}") from e
def clean_text(text: str) -> str:
    """Normalize whitespace: collapse all runs of whitespace inside a
    paragraph to single spaces while preserving blank-line paragraph
    breaks as exactly one empty line. Leading/trailing space is stripped.
    """
    # Protect paragraph boundaries with a sentinel before collapsing,
    # so the global whitespace squeeze cannot eat them.
    marked = re.sub(r'\n\s*\n', '¶¶', text)
    collapsed = re.sub(r'\s+', ' ', marked)
    # Restore each sentinel as a canonical double newline.
    return collapsed.replace('¶¶', '\n\n').strip()
def format_article_markdown(article) -> str:
    """Render a parsed article object as a Markdown document.

    Emits, in order: H1 title, optional byline, optional publish date,
    optional lead image, then the whitespace-normalized body text.
    """
    parts = [f"# {article.title}\n\n"]

    # Byline: tolerate either a list of authors or a single string;
    # falsy entries (None, "") are dropped from the join.
    authors = article.authors
    if authors:
        if not isinstance(authors, list):
            authors = [authors]
        parts.append(f"*By {', '.join(filter(None, authors))}*\n\n")

    # Publish date: datetime objects get YYYY-MM-DD; anything else is
    # rendered verbatim via str().
    published = article.publish_date
    if published:
        if isinstance(published, datetime):
            date_str = published.strftime("%Y-%m-%d")
        else:
            date_str = str(published)
        parts.append(f"*Published: {date_str}*\n\n")

    if article.top_image:
        parts.append(f"![Article Image]({article.top_image})\n\n")

    # Body: normalize whitespace but keep paragraph breaks intact.
    if article.text:
        parts.append(clean_text(article.text))

    return "".join(parts)
async def main():
    """CLI entry point: fetch the URL given as argv[1], print it as Markdown.

    Exits with status 1 on bad usage or any processing failure.
    """
    if len(sys.argv) != 2:
        print("Usage: ./n3k <article_url>")
        sys.exit(1)

    url = sys.argv[1]
    try:
        article = await fetch_and_parse_article(url)
        print(format_article_markdown(article))
    except Exception as e:
        print(f"Error processing article: {str(e)}")
        sys.exit(1)
# Script entry point: drive the async main() on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())