Scripts/n3k

#!/usr/bin/env python3
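"""Fetch an article from a URL and print it as Markdown.

Strategy: download the page with browser-like headers, parse it with
newspaper3k, and prefer trafilatura's text and metadata where available;
if that path fails, fall back to letting newspaper3k download and parse
the page itself.
"""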
import sys
import asyncio
import trafilatura
import requests
from newspaper import Article
from urllib.parse import urlparse
from datetime import datetime
from typing import Optional
import re


def fetch_url_with_headers(url: str) -> Optional[str]:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/115.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.google.com/'
    }
    try:
        # Timeout added so a stalled server cannot hang the script indefinitely
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Error fetching URL with headers: {str(e)}", file=sys.stderr)
        return None


async def fetch_and_parse_article(url: str):
    # First try trafilatura with custom headers
    source = fetch_url_with_headers(url)
    if source:
        try:
            traf = trafilatura.extract_metadata(filecontent=source, default_url=url)
            article = Article(url)
            article.set_html(source)
            article.parse()

            # Extract text with trafilatura as plain text to avoid its line breaking
            raw_text = trafilatura.extract(source, output_format="text", include_comments=False)
            if raw_text:
                article.text = raw_text

            # Fill in missing article properties from trafilatura metadata;
            # extract_metadata can return None, so guard before dereferencing
            if traf:
                article.title = article.title or traf.title or url
                article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
                article.publish_date = traf.date or datetime.now()
                article.top_image = article.top_image or traf.image
                article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title()

            return article
        except Exception as e:
            print(f"Trafilatura extraction failed: {str(e)}", file=sys.stderr)

    # Fall back to newspaper3k with the same headers
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/115.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.google.com/'
        }
        article = Article(url)
        article.config.browser_user_agent = headers['User-Agent']
        article.config.headers = headers
        article.download()
        article.parse()
        article.source_url = urlparse(url).netloc.replace('www.', '').title()
        return article
    except Exception as e:
        raise Exception(f"Failed to parse article from {url}: {str(e)}") from e


def clean_text(text: str) -> str:
    # Convert paragraph breaks to a special marker
    text = re.sub(r'\n\s*\n', '¶¶', text)
    # Collapse all other whitespace (including single newlines) to single spaces
    text = re.sub(r'\s+', ' ', text)
    # Convert markers back to double newlines
    text = text.replace('¶¶', '\n\n')
    return text.strip()
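
# For example, clean_text("line one\nline two\n\nnext para") returns
# "line one line two\n\nnext para": single newlines collapse to spaces
# while paragraph breaks survive as blank lines.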


def format_article_markdown(article) -> str:
    # Title
    output = f"# {article.title}\n\n"

    # Metadata
    if article.authors:
        authors = article.authors if isinstance(article.authors, list) else [article.authors]
        output += f"*By {', '.join(filter(None, authors))}*\n\n"
    if article.publish_date:
        date_str = article.publish_date.strftime("%Y-%m-%d") if isinstance(article.publish_date, datetime) else str(article.publish_date)
        output += f"*Published: {date_str}*\n\n"
    if article.top_image:
        output += f"![Article Image]({article.top_image})\n\n"

    # Article body with normalized paragraph breaks
    if article.text:
        output += clean_text(article.text)

    return output


async def main():
    if len(sys.argv) != 2:
        print("Usage: ./n3k <article_url>")
        sys.exit(1)

    url = sys.argv[1]
    try:
        article = await fetch_and_parse_article(url)
        print(format_article_markdown(article))
    except Exception as e:
        # Errors go to stderr, consistent with the rest of the script
        print(f"Error processing article: {str(e)}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())
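
# Example invocation (hypothetical URL), redirecting the Markdown to a file:
#   ./n3k https://example.com/some-article > article.md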