pathScripts/n3k

101 lines
3.3 KiB
Python
Executable file

#!/usr/bin/env python3
import sys
import asyncio
import trafilatura
from newspaper import Article
from urllib.parse import urlparse
from datetime import datetime
import math
from typing import Optional
import textwrap
def _site_name(url: str) -> str:
    """Return a title-cased display name for the host part of *url*."""
    host = urlparse(url).netloc
    # Strip only a *leading* "www."; a blanket replace would also mangle
    # hosts that merely contain that substring further in.
    if host.startswith('www.'):
        host = host[len('www.'):]
    return host.title()


async def fetch_and_parse_article(url: str):
    """Download *url* and return a parsed newspaper ``Article``.

    Two strategies are tried in order:

    1. Fetch the page with trafilatura, parse the HTML with newspaper, and
       fill in / override article fields from trafilatura's metadata and
       Markdown extraction.
    2. If the fetch returns nothing or step 1 raises, let newspaper download
       and parse the page itself with a browser-like User-Agent.

    In both cases ``article.source_url`` is overwritten with a display name
    for the site (not a URL).

    NOTE(review): this coroutine contains no ``await``; the network calls
    below are made synchronously.

    Raises:
        Exception: if the fallback download/parse fails as well; the
            underlying error is attached as ``__cause__``.
    """
    # Try trafilatura first
    source = trafilatura.fetch_url(url)
    if source:
        try:
            traf = trafilatura.extract_metadata(filecontent=source, default_url=url)
            if traf is None:
                # Make the "no metadata" case explicit instead of relying on
                # an AttributeError further down; it still triggers the
                # newspaper fallback below.
                raise ValueError("trafilatura returned no metadata")

            article = Article(url)
            article.set_html(source)
            article.parse()

            # Update article properties with trafilatura data
            article.title = article.title or traf.title or url
            if not article.authors:
                traf_authors = traf.author if isinstance(traf.author, list) else [traf.author]
                # Drop None/empty entries so a missing author does not end
                # up as [None].
                article.authors = [name for name in traf_authors if name]
            # Prefer trafilatura's date, then whatever newspaper parsed;
            # the current time is only a last resort.
            article.publish_date = traf.date or article.publish_date or datetime.now()
            article.text = (
                trafilatura.extract(source, output_format="markdown", include_comments=False)
                or article.text
            )
            article.top_image = article.top_image or traf.image
            article.source_url = traf.sitename or _site_name(url)
            return article
        except Exception:
            # Deliberate best-effort: any failure on this path falls through
            # to the plain newspaper download below.
            pass

    # Fallback to newspaper3k
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        article = Article(url)
        article.config.browser_user_agent = headers['User-Agent']
        article.config.headers = headers
        article.download()
        article.parse()
        article.source_url = _site_name(url)
        return article
    except Exception as e:
        # Same exception type and message as before, now chained so the
        # original traceback is preserved.
        raise Exception(f"Failed to parse article from {url}: {e}") from e
def format_article_markdown(article) -> str:
    """Render a parsed article as a Markdown string.

    Reads ``title``, ``authors``, ``publish_date``, ``top_image`` and ``text``
    from *article*. The title is always emitted as a level-1 heading; the
    byline, publication date, image and body are each emitted only when the
    corresponding attribute is truthy. Body lines are stripped, blank lines
    are dropped, each remaining line is wrapped at 80 columns, and the
    results are separated by blank lines.

    Returns:
        The assembled Markdown text (no trailing newline after the body).
    """
    sections = [f"# {article.title}\n\n"]

    # Format metadata
    if article.authors:
        authors = article.authors if isinstance(article.authors, list) else [article.authors]
        # Filter empty/None names *before* deciding to print the byline;
        # otherwise a list such as [None] would render as "*By *".
        names = [name for name in authors if name]
        if names:
            sections.append(f"*By {', '.join(names)}*\n\n")

    if article.publish_date:
        published = article.publish_date
        # Dates may arrive as datetime objects or as preformatted values.
        if isinstance(published, datetime):
            date_str = published.strftime("%Y-%m-%d")
        else:
            date_str = str(published)
        sections.append(f"*Published: {date_str}*\n\n")

    if article.top_image:
        sections.append(f"![Article Image]({article.top_image})\n\n")

    # Format article text with proper wrapping
    if article.text:
        wrapped_paragraphs = [
            textwrap.fill(line.strip(), width=80)
            for line in article.text.split('\n')
            if line.strip()
        ]
        sections.append('\n\n'.join(wrapped_paragraphs))

    return ''.join(sections)
async def main():
    """Command-line entry point: fetch one article and print it as Markdown.

    Expects exactly one command-line argument, the article URL. On success
    the formatted article is written to stdout. Usage and error messages are
    written to stderr so they never mix with the Markdown output, and the
    process exits with status 1 in both of those cases.
    """
    if len(sys.argv) != 2:
        print("Usage: ./n3k <article_url>", file=sys.stderr)
        sys.exit(1)

    url = sys.argv[1]
    try:
        article = await fetch_and_parse_article(url)
        formatted_content = format_article_markdown(article)
        print(formatted_content)
    except Exception as e:
        # Top-level boundary: report any failure and signal it via exit code.
        print(f"Error processing article: {e}", file=sys.stderr)
        sys.exit(1)
if __name__ == "__main__":
    # Run the CLI only when executed as a script, not when imported.
    asyncio.run(main())