From 758b8ef136146bad249e3fa46cb9c9c7020e4c52 Mon Sep 17 00:00:00 2001 From: sanj <67624670+iodrift@users.noreply.github.com> Date: Tue, 11 Feb 2025 14:31:09 -0800 Subject: [PATCH] Auto-update: Tue Feb 11 14:31:09 PST 2025 --- n3k | 111 ++++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 81 insertions(+), 30 deletions(-) diff --git a/n3k b/n3k index 82f824e..bb1785b 100755 --- a/n3k +++ b/n3k @@ -1,50 +1,101 @@ #!/usr/bin/env python3 import sys +import asyncio +import trafilatura from newspaper import Article +from urllib.parse import urlparse +from datetime import datetime +import math +from typing import Optional import textwrap -def format_article(url): +async def fetch_and_parse_article(url: str): + # Try trafilatura first + source = trafilatura.fetch_url(url) + + if source: + try: + traf = trafilatura.extract_metadata(filecontent=source, default_url=url) + + article = Article(url) + article.set_html(source) + article.parse() + + # Update article properties with trafilatura data + article.title = article.title or traf.title or url + article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author]) + article.publish_date = traf.date or datetime.now() + article.text = trafilatura.extract(source, output_format="markdown", include_comments=False) or article.text + article.top_image = article.top_image or traf.image + article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title() + + return article + except Exception: + pass + + # Fallback to newspaper3k try: - # Initialize and download the article + headers = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + } + article = Article(url) + article.config.browser_user_agent = headers['User-Agent'] + article.config.headers = headers article.download() article.parse() - # Format the title - output = f"# {article.title}\n\n" - - # Format the authors - if article.authors: - output += f"*{', '.join(article.authors)}*\n\n" - - # Format the text with proper wrapping and paragraph separation - if article.text: - # Split into paragraphs and wrap each one - paragraphs = article.text.split('\n') - wrapped_paragraphs = [] - - for paragraph in paragraphs: - if paragraph.strip(): # Only process non-empty paragraphs - # Wrap text at 80 characters - wrapped = textwrap.fill(paragraph.strip(), width=80) - wrapped_paragraphs.append(wrapped) - - output += '\n\n'.join(wrapped_paragraphs) - - return output + article.source_url = urlparse(url).netloc.replace('www.', '').title() + return article except Exception as e: - return f"Error processing article: {str(e)}" + raise Exception(f"Failed to parse article from {url}: {str(e)}") -def main(): +def format_article_markdown(article) -> str: + # Format title + output = f"# {article.title}\n\n" + + # Format metadata + if article.authors: + authors = article.authors if isinstance(article.authors, list) else [article.authors] + output += f"*By {', '.join(filter(None, authors))}*\n\n" + + if article.publish_date: + date_str = article.publish_date.strftime("%Y-%m-%d") if isinstance(article.publish_date, datetime) else str(article.publish_date) + output += f"*Published: {date_str}*\n\n" + + if article.top_image: + output += f"\n\n" + + # Format article text with proper wrapping + if article.text: + paragraphs = article.text.split('\n') + wrapped_paragraphs = [] + + for paragraph in paragraphs: + if paragraph.strip(): + wrapped = textwrap.fill(paragraph.strip(), width=80) + wrapped_paragraphs.append(wrapped) + + output += '\n\n'.join(wrapped_paragraphs) + + return output + +async def main(): if len(sys.argv) != 2: - print("Usage: ./script.py <article_url>") + print("Usage: ./n3k <article_url>") sys.exit(1) url = sys.argv[1] - formatted_article = format_article(url) - print(formatted_article) + try: + article = await fetch_and_parse_article(url) + formatted_content = format_article_markdown(article) + print(formatted_content) + except Exception as e: + print(f"Error processing article: {str(e)}") + sys.exit(1) if __name__ == "__main__": - main() + asyncio.run(main())