Auto-update: Tue Feb 11 14:31:09 PST 2025
This commit is contained in:
parent
34b8a95623
commit
758b8ef136
1 changed files with 81 additions and 30 deletions
111
n3k
111
n3k
|
@ -1,50 +1,101 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import asyncio
|
||||
import trafilatura
|
||||
from newspaper import Article
|
||||
from urllib.parse import urlparse
|
||||
from datetime import datetime
|
||||
import math
|
||||
from typing import Optional
|
||||
import textwrap
|
||||
|
||||
async def fetch_and_parse_article(url: str):
    """Fetch *url* and parse it into a populated ``newspaper.Article``.

    Strategy: try trafilatura first (better metadata and a Markdown body);
    if that path fails for any reason, fall back to plain newspaper3k.

    Args:
        url: Address of the article to fetch.

    Returns:
        A populated ``newspaper.Article`` instance.

    Raises:
        RuntimeError: If the newspaper3k fallback also fails.

    NOTE(review): the body never awaits — the trafilatura/newspaper calls
    are blocking; wrap them in ``asyncio.to_thread`` if real concurrency
    is ever needed.
    """
    # --- Primary path: trafilatura -------------------------------------
    source = trafilatura.fetch_url(url)

    if source:
        try:
            traf = trafilatura.extract_metadata(filecontent=source, default_url=url)

            article = Article(url)
            article.set_html(source)
            article.parse()

            # Merge trafilatura metadata into the newspaper Article,
            # preferring values newspaper already extracted.
            article.title = article.title or traf.title or url
            if not article.authors:
                # traf.author may be a plain string or None; normalise to a
                # list and drop empties so ', '.join() never sees None.
                authors = traf.author if isinstance(traf.author, list) else [traf.author]
                article.authors = [a for a in authors if a]
            article.publish_date = traf.date or datetime.now()
            article.text = trafilatura.extract(source, output_format="markdown", include_comments=False) or article.text
            article.top_image = article.top_image or traf.image
            article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title()

            return article
        except Exception:
            # Deliberate best-effort: any failure in the trafilatura path
            # falls through to the newspaper3k fallback below.
            pass

    # --- Fallback path: newspaper3k ------------------------------------
    try:
        # A browser-like User-Agent avoids trivial bot blocking.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }

        article = Article(url)
        article.config.browser_user_agent = headers['User-Agent']
        article.config.headers = headers
        article.download()
        article.parse()
        article.source_url = urlparse(url).netloc.replace('www.', '').title()
        return article

    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        # RuntimeError is a subclass of Exception, so existing callers
        # catching Exception still work.
        raise RuntimeError(f"Failed to parse article from {url}: {str(e)}") from e
def format_article_markdown(article) -> str:
    """Render a parsed article as a Markdown document.

    Args:
        article: Object exposing ``title``, ``authors``, ``publish_date``,
            ``top_image`` and ``text`` attributes (e.g. ``newspaper.Article``).

    Returns:
        A Markdown string: title heading, optional byline / date / hero
        image, then the body with each paragraph wrapped at 80 columns.
    """
    # Title heading.
    output = f"# {article.title}\n\n"

    # Byline: accept a single string or a list; drop empty entries.
    if article.authors:
        authors = article.authors if isinstance(article.authors, list) else [article.authors]
        output += f"*By {', '.join(filter(None, authors))}*\n\n"

    # Publication date: ISO date for datetime objects, raw string otherwise.
    if article.publish_date:
        date_str = article.publish_date.strftime("%Y-%m-%d") if isinstance(article.publish_date, datetime) else str(article.publish_date)
        output += f"*Published: {date_str}*\n\n"

    # Hero image as a Markdown image tag. (The previous f"\n\n" here had
    # no placeholder and emitted nothing useful — the image tag was lost.)
    if article.top_image:
        output += f"![]({article.top_image})\n\n"

    # Body: wrap every non-empty paragraph at 80 characters and separate
    # paragraphs with blank lines.
    if article.text:
        wrapped_paragraphs = [
            textwrap.fill(paragraph.strip(), width=80)
            for paragraph in article.text.split('\n')
            if paragraph.strip()
        ]
        output += '\n\n'.join(wrapped_paragraphs)

    return output
async def main():
    """Command-line entry point.

    Expects exactly one argument (the article URL); fetches, formats and
    prints it, exiting with status 1 on usage errors or parse failures.
    """
    # Guard clause: exactly one positional argument is required.
    args = sys.argv[1:]
    if len(args) != 1:
        print("Usage: ./n3k <article_url>")
        sys.exit(1)

    target_url = args[0]
    try:
        # Fetch/parse, then render — both inside the try so a formatting
        # failure is reported the same way as a fetch failure.
        parsed = await fetch_and_parse_article(target_url)
        print(format_article_markdown(parsed))
    except Exception as e:
        print(f"Error processing article: {str(e)}")
        sys.exit(1)
if __name__ == "__main__":
    # Drive the async entry point with a fresh event loop.
    asyncio.run(main())
|
Loading…
Add table
Reference in a new issue