Auto-update: Tue Feb 11 14:31:09 PST 2025

sanj 2025-02-11 14:31:09 -08:00
parent 34b8a95623
commit 758b8ef136

n3k

@@ -1,50 +1,101 @@
 #!/usr/bin/env python3
 import sys
+import asyncio
+import trafilatura
 from newspaper import Article
+from urllib.parse import urlparse
+from datetime import datetime
+import math
+from typing import Optional
 import textwrap
 
-def format_article(url):
+
+async def fetch_and_parse_article(url: str):
+    # Try trafilatura first
+    source = trafilatura.fetch_url(url)
+    if source:
+        try:
+            traf = trafilatura.extract_metadata(filecontent=source, default_url=url)
+            article = Article(url)
+            article.set_html(source)
+            article.parse()
+            # Update article properties with trafilatura data
+            article.title = article.title or traf.title or url
+            article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
+            article.publish_date = traf.date or datetime.now()
+            article.text = trafilatura.extract(source, output_format="markdown", include_comments=False) or article.text
+            article.top_image = article.top_image or traf.image
+            article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title()
+            return article
+        except Exception:
+            pass
+
+    # Fallback to newspaper3k
     try:
-        # Initialize and download the article
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        }
         article = Article(url)
+        article.config.browser_user_agent = headers['User-Agent']
+        article.config.headers = headers
         article.download()
         article.parse()
-
-        # Format the title
-        output = f"# {article.title}\n\n"
-
-        # Format the authors
-        if article.authors:
-            output += f"*{', '.join(article.authors)}*\n\n"
-
-        # Format the text with proper wrapping and paragraph separation
-        if article.text:
-            # Split into paragraphs and wrap each one
-            paragraphs = article.text.split('\n')
-            wrapped_paragraphs = []
-            for paragraph in paragraphs:
-                if paragraph.strip():  # Only process non-empty paragraphs
-                    # Wrap text at 80 characters
-                    wrapped = textwrap.fill(paragraph.strip(), width=80)
-                    wrapped_paragraphs.append(wrapped)
-            output += '\n\n'.join(wrapped_paragraphs)
-        return output
+        article.source_url = urlparse(url).netloc.replace('www.', '').title()
+        return article
     except Exception as e:
-        return f"Error processing article: {str(e)}"
+        raise Exception(f"Failed to parse article from {url}: {str(e)}")
+
 
-def main():
+def format_article_markdown(article) -> str:
+    # Format title
+    output = f"# {article.title}\n\n"
+    # Format metadata
+    if article.authors:
+        authors = article.authors if isinstance(article.authors, list) else [article.authors]
+        output += f"*By {', '.join(filter(None, authors))}*\n\n"
+    if article.publish_date:
+        date_str = article.publish_date.strftime("%Y-%m-%d") if isinstance(article.publish_date, datetime) else str(article.publish_date)
+        output += f"*Published: {date_str}*\n\n"
+    if article.top_image:
+        output += f"![Article Image]({article.top_image})\n\n"
+    # Format article text with proper wrapping
+    if article.text:
+        paragraphs = article.text.split('\n')
+        wrapped_paragraphs = []
+        for paragraph in paragraphs:
+            if paragraph.strip():
+                wrapped = textwrap.fill(paragraph.strip(), width=80)
+                wrapped_paragraphs.append(wrapped)
+        output += '\n\n'.join(wrapped_paragraphs)
+    return output
+
+
+async def main():
     if len(sys.argv) != 2:
-        print("Usage: ./script.py <article_url>")
+        print("Usage: ./n3k <article_url>")
         sys.exit(1)
 
     url = sys.argv[1]
-    formatted_article = format_article(url)
-    print(formatted_article)
+    try:
+        article = await fetch_and_parse_article(url)
+        formatted_content = format_article_markdown(article)
+        print(formatted_content)
+    except Exception as e:
+        print(f"Error processing article: {str(e)}")
+        sys.exit(1)
+
 
 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
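
For orientation, a minimal standalone sketch of the trafilatura-first path the updated script takes before falling back to newspaper3k. The URL is illustrative and the error handling is pared down; fetch_url, extract_metadata, and extract are the same trafilatura calls used in the diff above.

#!/usr/bin/env python3
# Sketch of n3k's trafilatura-first extraction; the URL is illustrative.
import trafilatura

url = "https://example.com/some-article"
html = trafilatura.fetch_url(url)  # page HTML as a string, or None on failure
if html:
    meta = trafilatura.extract_metadata(filecontent=html, default_url=url)
    body = trafilatura.extract(html, output_format="markdown", include_comments=False)
    print(meta.title if meta and meta.title else url)
    print(body or "(no body extracted)")

As a CLI, the updated script itself would be invoked as ./n3k <article_url> (assuming the file is executable) and prints the article as 80-column-wrapped Markdown.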