Auto-update: Tue Feb 11 14:31:09 PST 2025

This commit is contained in:
sanj 2025-02-11 14:31:09 -08:00
parent 34b8a95623
commit 758b8ef136

111
n3k
View file

@@ -1,50 +1,101 @@
#!/usr/bin/env python3
import sys
import asyncio
import trafilatura
from newspaper import Article
from urllib.parse import urlparse
from datetime import datetime
import math
from typing import Optional
import textwrap
async def fetch_and_parse_article(url: str):
    """Fetch the page at *url* and return a parsed newspaper ``Article``.

    Strategy 1: download with trafilatura, parse with newspaper, then fill
    any missing fields (title, authors, date, text, image, source) from
    trafilatura's metadata; the body text is re-extracted as Markdown.
    Strategy 2 (fallback): plain newspaper3k download + parse with a
    browser-like User-Agent.

    Raises:
        Exception: if both strategies fail, with the original cause chained.

    NOTE(review): declared ``async`` but every call here is blocking
    (trafilatura.fetch_url, Article.download) — confirm whether callers
    need a real coroutine or the blocking work should move to an executor.
    """
    # --- Strategy 1: trafilatura fetch + metadata, newspaper parse -------
    source = trafilatura.fetch_url(url)
    if source:
        try:
            traf = trafilatura.extract_metadata(filecontent=source, default_url=url)
            article = Article(url)
            article.set_html(source)
            article.parse()
            # Prefer newspaper's parsed fields; fall back to trafilatura
            # metadata so partial parses still yield a usable article.
            article.title = article.title or traf.title or url
            article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
            article.publish_date = traf.date or datetime.now()
            article.text = trafilatura.extract(source, output_format="markdown", include_comments=False) or article.text
            article.top_image = article.top_image or traf.image
            article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title()
            return article
        except Exception:
            # Best-effort: fall through to the newspaper3k-only path.
            pass

    # --- Strategy 2: newspaper3k download + parse ------------------------
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        article = Article(url)
        article.config.browser_user_agent = headers['User-Agent']
        article.config.headers = headers
        article.download()
        article.parse()
        # Derive a display name for the source from the hostname.
        article.source_url = urlparse(url).netloc.replace('www.', '').title()
        return article
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise Exception(f"Failed to parse article from {url}: {str(e)}") from e
def format_article_markdown(article) -> str:
    """Render a parsed article object as a Markdown document string.

    Emits, in order: an H1 title, italic author line, italic publish date,
    a hero image, then the body text with each paragraph wrapped at 80
    columns and paragraphs separated by blank lines. Author/date/image
    sections are skipped when the corresponding attribute is falsy.

    Args:
        article: any object exposing ``title``, ``authors``,
            ``publish_date``, ``top_image`` and ``text`` attributes
            (e.g. a newspaper ``Article``).

    Returns:
        The formatted Markdown text.
    """
    output = f"# {article.title}\n\n"

    # Authors may arrive as a list or a single string; drop empty entries.
    if article.authors:
        authors = article.authors if isinstance(article.authors, list) else [article.authors]
        output += f"*By {', '.join(filter(None, authors))}*\n\n"

    # publish_date may be a datetime or a pre-formatted string.
    if article.publish_date:
        date_str = article.publish_date.strftime("%Y-%m-%d") if isinstance(article.publish_date, datetime) else str(article.publish_date)
        output += f"*Published: {date_str}*\n\n"

    if article.top_image:
        output += f"![Article Image]({article.top_image})\n\n"

    # Wrap each non-empty paragraph at 80 columns.
    if article.text:
        wrapped_paragraphs = [
            textwrap.fill(paragraph.strip(), width=80)
            for paragraph in article.text.split('\n')
            if paragraph.strip()
        ]
        output += '\n\n'.join(wrapped_paragraphs)

    return output
async def main():
    """CLI entry point: fetch the article at ``argv[1]`` and print Markdown.

    Exits with status 1 on bad usage or on any processing error, printing
    a human-readable message instead of a traceback.
    """
    if len(sys.argv) != 2:
        print("Usage: ./n3k <article_url>")
        sys.exit(1)

    url = sys.argv[1]
    try:
        article = await fetch_and_parse_article(url)
        formatted_content = format_article_markdown(article)
        print(formatted_content)
    except Exception as e:
        # Top-level boundary: report and exit non-zero rather than crash.
        print(f"Error processing article: {str(e)}")
        sys.exit(1)
if __name__ == "__main__":
    # main() is a coroutine; asyncio.run drives it to completion.
    asyncio.run(main())