#!/usr/bin/env python3
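"""Fetch a news article from a URL and print it as Markdown.

Tries trafilatura first (downloading the page with browser-like request
headers), then falls back to newspaper3k if that extraction fails.

Usage: ./n3k <article_url>
"""
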
import asyncio
import re
import sys
from datetime import datetime
from typing import Optional
from urllib.parse import urlparse

import requests
import trafilatura
from newspaper import Article


def fetch_url_with_headers(url: str) -> Optional[str]:
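    """Fetch raw HTML using browser-like headers; return None on failure."""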
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/115.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.google.com/'
    }
    try:
        # A timeout keeps a stalled server from hanging the script indefinitely.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching URL with headers: {e}", file=sys.stderr)
        return None


async def fetch_and_parse_article(url: str):
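    """Fetch and parse an article, preferring trafilatura over newspaper3k.

    Declared async for the caller's convenience; the underlying I/O is
    currently blocking.
    """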
    # First try using trafilatura with custom headers
    source = fetch_url_with_headers(url)

    if source:
        try:
            # If metadata extraction fails this raises inside the try block,
            # dropping us through to the newspaper3k fallback below.
            traf = trafilatura.extract_metadata(filecontent=source, default_url=url)

            article = Article(url)
            article.set_html(source)
            article.parse()

            # Extract text with trafilatura as plain text (not markdown)
            # to avoid its hard line breaking
            raw_text = trafilatura.extract(source, output_format="text", include_comments=False)
            if raw_text:
                article.text = raw_text

            # Fill gaps in the newspaper parse with trafilatura metadata
            article.title = article.title or traf.title or url
            article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
            article.publish_date = traf.date or datetime.now()
            article.top_image = article.top_image or traf.image
            article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title()

            return article
        except Exception as e:
            print(f"Trafilatura extraction failed: {e}", file=sys.stderr)

    # Fallback to newspaper3k with the same browser-like headers
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/115.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.google.com/'
        }

        article = Article(url)
        article.config.browser_user_agent = headers['User-Agent']
        article.config.headers = headers
        article.download()
        article.parse()

        article.source_url = urlparse(url).netloc.replace('www.', '').title()
        return article

    except Exception as e:
        raise Exception(f"Failed to parse article from {url}: {e}") from e


def clean_text(text: str) -> str:
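    """Normalize whitespace while preserving paragraph breaks.

    Example: "a\\nb\\n\\nc" -> "a b\\n\\nc"
    """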
    # Convert paragraph breaks to a special marker
    text = re.sub(r'\n\s*\n', '¶¶', text)
    # Convert all other whitespace (including single newlines) to single spaces
    text = re.sub(r'\s+', ' ', text)
    # Convert markers back to double newlines
    text = text.replace('¶¶', '\n\n')
    return text.strip()


def format_article_markdown(article) -> str:
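    """Render a parsed article as Markdown: title, byline, date, image, body."""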
    # Format title
    output = f"# {article.title}\n\n"

    # Format metadata
    if article.authors:
        authors = article.authors if isinstance(article.authors, list) else [article.authors]
        output += f"*By {', '.join(filter(None, authors))}*\n\n"

    if article.publish_date:
        date_str = article.publish_date.strftime("%Y-%m-%d") if isinstance(article.publish_date, datetime) else str(article.publish_date)
        output += f"*Published: {date_str}*\n\n"

    if article.top_image:
        # Embed the lead image as a Markdown image link
        output += f"![]({article.top_image})\n\n"

    # Format article text with improved paragraph handling
    if article.text:
        clean_content = clean_text(article.text)
        output += clean_content

    return output


async def main():
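    """CLI entry point: read the article URL from argv and print Markdown."""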
    if len(sys.argv) != 2:
        print("Usage: ./n3k <article_url>", file=sys.stderr)
        sys.exit(1)

    url = sys.argv[1]
    try:
        article = await fetch_and_parse_article(url)
        formatted_content = format_article_markdown(article)
        print(formatted_content)
    except Exception as e:
        print(f"Error processing article: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())