#!/usr/bin/env python3
|
|
|
|
import sys
|
|
import asyncio
|
|
import trafilatura
|
|
from newspaper import Article
|
|
from urllib.parse import urlparse
|
|
from datetime import datetime
|
|
import math
|
|
from typing import Optional
|
|
import textwrap
|
|
import re
|
|
|
|
async def fetch_and_parse_article(url: str):
    """Fetch *url* and return a parsed newspaper ``Article``.

    Tries trafilatura first (better text extraction); on any failure in
    that path, falls back to a plain newspaper3k download/parse.

    Args:
        url: The article URL to fetch.

    Returns:
        A populated ``newspaper.Article`` instance with an extra
        ``source_url`` attribute holding a display name for the site.

    Raises:
        Exception: if both extraction strategies fail.
    """
    # NOTE(review): trafilatura/newspaper calls are blocking and this
    # coroutine never awaits, so it will block the event loop for the
    # duration of the network I/O — fine for this single-URL CLI, but
    # use run_in_executor if this is ever called concurrently.
    source = trafilatura.fetch_url(url)

    if source:
        try:
            traf = trafilatura.extract_metadata(filecontent=source, default_url=url)

            article = Article(url)
            article.set_html(source)
            article.parse()

            # Extract text with trafilatura but without markdown output,
            # to avoid its hard line breaking.
            raw_text = trafilatura.extract(source, output_format="text", include_comments=False)
            if raw_text:
                article.text = raw_text

            # Merge trafilatura metadata into the newspaper article.
            # extract_metadata() may return None — guard before attribute
            # access so a metadata miss doesn't discard a good parse.
            if traf is not None:
                article.title = article.title or traf.title or url
                article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
                article.publish_date = traf.date or datetime.now()
                article.top_image = article.top_image or traf.image
                article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title()
            else:
                article.title = article.title or url
                article.publish_date = article.publish_date or datetime.now()
                article.source_url = urlparse(url).netloc.replace('www.', '').title()

            return article
        except Exception as e:
            # Best-effort: log to stderr and fall through to newspaper3k.
            print(f"Trafilatura extraction failed: {str(e)}", file=sys.stderr)

    # Fallback to newspaper3k
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }

        article = Article(url)
        article.config.browser_user_agent = headers['User-Agent']
        article.config.headers = headers
        article.download()
        article.parse()

        article.source_url = urlparse(url).netloc.replace('www.', '').title()
        return article

    except Exception as e:
        # Chain the cause so the original traceback isn't lost.
        raise Exception(f"Failed to parse article from {url}: {str(e)}") from e
|
|
|
|
def clean_text(text: str) -> str:
    """Normalize whitespace while preserving paragraph structure.

    Blank-line paragraph breaks survive as exactly one empty line;
    every other whitespace run (including single newlines from
    hard-wrapped source text) collapses to a single space.
    """
    marker = '¶¶'
    # Protect paragraph boundaries before flattening whitespace.
    protected = re.sub(r'\n\s*\n', marker, text)
    # Collapse all remaining whitespace runs to single spaces.
    flattened = re.sub(r'\s+', ' ', protected)
    # Restore the protected boundaries as real blank lines and trim.
    return flattened.replace(marker, '\n\n').strip()
|
|
|
|
def format_article_markdown(article) -> str:
    """Render a parsed article as a Markdown document.

    Emits the title as an H1, optional author and date lines in
    italics, the top image (if any) as a Markdown image, then the
    cleaned body text.

    Args:
        article: An object with ``title``, ``authors``,
            ``publish_date``, ``top_image`` and ``text`` attributes
            (e.g. a ``newspaper.Article``).

    Returns:
        The formatted Markdown string.
    """
    # Format title
    output = f"# {article.title}\n\n"

    # Byline — tolerate a single-author string and drop empty/None entries.
    if article.authors:
        authors = article.authors if isinstance(article.authors, list) else [article.authors]
        output += f"*By {', '.join(filter(None, authors))}*\n\n"

    if article.publish_date:
        date_str = article.publish_date.strftime("%Y-%m-%d") if isinstance(article.publish_date, datetime) else str(article.publish_date)
        output += f"*Published: {date_str}*\n\n"

    # BUG FIX: the f-string here was empty ("\n\n"), so the image URL was
    # never rendered — only stray blank lines. Emit a Markdown image.
    if article.top_image:
        output += f"![{article.title}]({article.top_image})\n\n"

    # Format article text with normalized paragraph handling.
    if article.text:
        output += clean_text(article.text)

    return output
|
|
|
|
async def main():
    """CLI entry point: fetch one article URL and print it as Markdown."""
    # Exactly one positional argument (the article URL) is required.
    if len(sys.argv) != 2:
        print("Usage: ./n3k <article_url>")
        sys.exit(1)

    target = sys.argv[1]
    try:
        # Fetch, render, and print in one guarded sequence so any
        # failure along the way reports the same error message.
        parsed = await fetch_and_parse_article(target)
        print(format_article_markdown(parsed))
    except Exception as e:
        print(f"Error processing article: {str(e)}")
        sys.exit(1)
|
|
|
|
# Script entry point: run the async CLI driver on the default event loop.
if __name__ == "__main__":
    asyncio.run(main())
|