Auto-update: Tue Feb 11 14:31:09 PST 2025

sanj 2025-02-11 14:31:09 -08:00
parent 34b8a95623
commit 758b8ef136

n3k (85 lines changed)

```diff
@@ -1,32 +1,81 @@
 #!/usr/bin/env python3
 import sys
+import asyncio
+import trafilatura
 from newspaper import Article
+from urllib.parse import urlparse
+from datetime import datetime
+import math
+from typing import Optional
 import textwrap
 
-def format_article(url):
-    try:
-        # Initialize and download the article
-        article = Article(url)
-        article.download()
-        article.parse()
-
-        # Format the title
-        output = f"# {article.title}\n\n"
-
-        # Format the authors
-        if article.authors:
-            output += f"*{', '.join(article.authors)}*\n\n"
-
-        # Format the text with proper wrapping and paragraph separation
-        if article.text:
-            # Split into paragraphs and wrap each one
-            paragraphs = article.text.split('\n')
-            wrapped_paragraphs = []
-
-            for paragraph in paragraphs:
-                if paragraph.strip():  # Only process non-empty paragraphs
-                    # Wrap text at 80 characters
-                    wrapped = textwrap.fill(paragraph.strip(), width=80)
-                    wrapped_paragraphs.append(wrapped)
+async def fetch_and_parse_article(url: str):
+    # Try trafilatura first
+    source = trafilatura.fetch_url(url)
+    if source:
+        try:
+            traf = trafilatura.extract_metadata(filecontent=source, default_url=url)
+            article = Article(url)
+            article.set_html(source)
+            article.parse()
+
+            # Update article properties with trafilatura data
+            article.title = article.title or traf.title or url
+            article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
+            article.publish_date = traf.date or datetime.now()
+            article.text = trafilatura.extract(source, output_format="markdown", include_comments=False) or article.text
+            article.top_image = article.top_image or traf.image
+            article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title()
+            return article
+        except Exception:
+            pass
+
+    # Fallback to newspaper3k
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        }
+        article = Article(url)
+        article.config.browser_user_agent = headers['User-Agent']
+        article.config.headers = headers
+        article.download()
+        article.parse()
+        article.source_url = urlparse(url).netloc.replace('www.', '').title()
+        return article
+    except Exception as e:
+        raise Exception(f"Failed to parse article from {url}: {str(e)}")
+
+def format_article_markdown(article) -> str:
+    # Format title
+    output = f"# {article.title}\n\n"
+
+    # Format metadata
+    if article.authors:
+        authors = article.authors if isinstance(article.authors, list) else [article.authors]
+        output += f"*By {', '.join(filter(None, authors))}*\n\n"
+
+    if article.publish_date:
+        date_str = article.publish_date.strftime("%Y-%m-%d") if isinstance(article.publish_date, datetime) else str(article.publish_date)
+        output += f"*Published: {date_str}*\n\n"
+
+    if article.top_image:
+        output += f"![Article Image]({article.top_image})\n\n"
+
+    # Format article text with proper wrapping
+    if article.text:
+        paragraphs = article.text.split('\n')
+        wrapped_paragraphs = []
+
+        for paragraph in paragraphs:
+            if paragraph.strip():
+                wrapped = textwrap.fill(paragraph.strip(), width=80)
+                wrapped_paragraphs.append(wrapped)
```
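The heart of this hunk is the precedence chain that merges newspaper3k's parse results with trafilatura's metadata: each `or` keeps its first truthy operand, so a field newspaper3k filled in wins, trafilatura's metadata is the fallback, and a hard default (the URL, the current time) comes last. A minimal sketch of the idiom, using hypothetical stand-in values rather than real parser output:

```python
# Sketch of the field-merge idiom from fetch_and_parse_article.
# np_title / traf_title / traf_author are hypothetical stand-ins.
np_title = None                     # newspaper3k found no title
traf_title = "Example Headline"     # trafilatura's metadata did
url = "https://example.com/story"

# `or` returns its first truthy operand, so None and "" fall through
title = np_title or traf_title or url
assert title == "Example Headline"

# Authors are normalized to a list first, since trafilatura may hand
# back either a single string or a list of names:
traf_author = "Jane Doe"
authors = [] or (traf_author if isinstance(traf_author, list) else [traf_author])
assert authors == ["Jane Doe"]
```

One caveat the diff inherits: because `or` treats empty strings and empty lists as missing, a legitimately empty field from newspaper3k is silently replaced by trafilatura's value, which appears to be the intended behavior here.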
```diff
@@ -34,17 +83,19 @@ def format_article(url)
-        return output
-    except Exception as e:
-        return f"Error processing article: {str(e)}"
+    return output
 
-def main():
+async def main():
     if len(sys.argv) != 2:
-        print("Usage: ./script.py <article_url>")
+        print("Usage: ./n3k <article_url>")
         sys.exit(1)
 
     url = sys.argv[1]
-    formatted_article = format_article(url)
-    print(formatted_article)
+    try:
+        article = await fetch_and_parse_article(url)
+        formatted_content = format_article_markdown(article)
+        print(formatted_content)
+    except Exception as e:
+        print(f"Error processing article: {str(e)}")
+        sys.exit(1)
 
 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
```
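The rewritten script is invoked as `./n3k <article_url>`. New line 82 falls between the two hunks and is not shown; presumably it joins `wrapped_paragraphs` into `output`. Assuming that, `format_article_markdown` can be exercised without network access by handing it a stub carrying just the attributes it reads; the stub below is a hypothetical stand-in, not a real `newspaper.Article`:

```python
from datetime import datetime
from types import SimpleNamespace

# Hypothetical stand-in for a parsed article; only the attributes that
# format_article_markdown reads are supplied.
stub = SimpleNamespace(
    title="Example Headline",
    authors=["Jane Doe"],
    publish_date=datetime(2025, 2, 11),
    top_image="https://example.com/lead.jpg",
    text="First paragraph.\nSecond paragraph.",
)

print(format_article_markdown(stub))
# Expected shape of the output:
# # Example Headline
#
# *By Jane Doe*
#
# *Published: 2025-02-11*
#
# ![Article Image](https://example.com/lead.jpg)
#
# ...wrapped body text...
```

One design point worth noting: `fetch_and_parse_article` is declared `async` but never awaits anything, since `trafilatura.fetch_url` and `Article.download` are synchronous calls; `asyncio.run(main())` therefore changes the entry point without adding any actual concurrency.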