Auto-update: Sat Jun 29 17:18:50 PDT 2024

parent 565a576c48
commit ad0ae30575
3 changed files with 560 additions and 549 deletions
@@ -24,6 +24,7 @@ MODULES:
  ig: off
  llm: on
  loc: on
  news: on
  note: on
  rag: off
  sd: on
sijapi/routers/news.py (new file, 558 lines)
@@ -0,0 +1,558 @@
from fastapi import APIRouter, BackgroundTasks, File, UploadFile, Form, HTTPException, Response, Query, Path as FastAPIPath
from fastapi.responses import JSONResponse
from zoneinfo import ZoneInfo
from io import BytesIO
from pydantic import BaseModel
from bs4 import BeautifulSoup
import requests
from markdownify import markdownify as md
import os
import mimetypes
from datetime import datetime as dt_datetime
import shutil
import uuid
import aiohttp
from pathlib import Path
from urllib.parse import urlparse
from urllib3.util.retry import Retry
from typing import Optional
import newspaper
from newspaper import Article
import trafilatura
from readability import Document
from requests.adapters import HTTPAdapter
from sijapi import API, L, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, GEO
from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path
from sijapi.routers import llm, tts, asr, loc

news = APIRouter()

### CLIPPER ###
@news.post("/clip")
async def clip_post(
    bg_tasks: BackgroundTasks,
    url: Optional[str] = Form(None),
    source: Optional[str] = Form(None),
    title: Optional[str] = Form(None),
    tts: str = Form('summary'),
    voice: str = Form(DEFAULT_VOICE),
    encoding: str = Form('utf-8')
):
    markdown_filename = await process_article(bg_tasks, url, title, encoding, source, tts, voice)
    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}

@news.post("/archive")
async def archive_post(
    url: Optional[str] = Form(None),
    source: Optional[str] = Form(None),
    title: Optional[str] = Form(None),
    encoding: str = Form('utf-8')
):
    markdown_filename = await process_archive(url, title, encoding, source)
    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}

@news.get("/clip")
async def clip_get(
    bg_tasks: BackgroundTasks,
    url: str,
    title: Optional[str] = Query(None),
    encoding: str = Query('utf-8'),
    tts: str = Query('summary'),
    voice: str = Query(DEFAULT_VOICE)
):
    markdown_filename = await process_article(bg_tasks, url, title, encoding, tts=tts, voice=voice)
    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}

@news.post("/note/add")
async def note_add_endpoint(file: Optional[UploadFile] = File(None), text: Optional[str] = Form(None), source: Optional[str] = Form(None), bg_tasks: BackgroundTasks = None):
    L.DEBUG(f"Received request on /note/add...")
    if not file and not text:
        L.WARN(f"... without any file or text!")
        raise HTTPException(status_code=400, detail="Either text or a file must be provided")
    else:
        result = await process_for_daily_note(file, text, source, bg_tasks)
        L.INFO(f"Result on /note/add: {result}")
        return JSONResponse(result, status_code=204)

async def process_for_daily_note(file: Optional[UploadFile] = File(None), text: Optional[str] = None, source: Optional[str] = None, bg_tasks: BackgroundTasks = None):
    now = dt_datetime.now()
    transcription_entry = ""
    file_entry = ""
    if file:
        L.DEBUG("File received...")
        file_content = await file.read()
        audio_io = BytesIO(file_content)

        # Improve error handling for file type guessing
        guessed_type = mimetypes.guess_type(file.filename)
        file_type = guessed_type[0] if guessed_type[0] else "application/octet-stream"

        L.DEBUG(f"Processing as {file_type}...")

        # Extract the main type (e.g., 'audio', 'image', 'video')
        main_type = file_type.split('/')[0]
        subdir = main_type.title() if main_type else "Documents"

        absolute_path, relative_path = assemble_journal_path(now, subdir=subdir, filename=file.filename)
        L.DEBUG(f"Destination path: {absolute_path}")

        with open(absolute_path, 'wb') as f:
            f.write(file_content)
        L.DEBUG(f"Processing {f.name}...")

        if main_type == 'audio':
            transcription = await asr.transcribe_audio(file_path=absolute_path, params=asr.TranscribeParams(model="small-en", language="en", threads=6))
            file_entry = f"![[{relative_path}]]"
        elif main_type == 'image':
            file_entry = f"![[{relative_path}]]"
        else:
            file_entry = f"[Source]({relative_path})"

    text_entry = text if text else ""
    L.DEBUG(f"transcription: {transcription_entry}\nfile_entry: {file_entry}\ntext_entry: {text_entry}")
    return await add_to_daily_note(transcription_entry, file_entry, text_entry, now)

async def add_to_daily_note(transcription: str = None, file_link: str = None, additional_text: str = None, date_time: dt_datetime = None):
    date_time = date_time or dt_datetime.now()
    note_path, _ = assemble_journal_path(date_time, filename='Notes', extension=".md", no_timestamp = True)
    time_str = date_time.strftime("%H:%M")

    entry_lines = []
    if additional_text and additional_text.strip():
        entry_lines.append(f"\t* {additional_text.strip()}")
    if transcription and transcription.strip():
        entry_lines.append(f"\t* {transcription.strip()}")
    if file_link and file_link.strip():
        entry_lines.append(f"\t\t {file_link.strip()}")

    entry = f"\n* **{time_str}**\n" + "\n".join(entry_lines)

    # Write the entry to the end of the file
    if note_path.exists():
        with open(note_path, 'a', encoding='utf-8') as note_file:
            note_file.write(entry)
    else:
        date_str = date_time.strftime("%Y-%m-%d")
        frontmatter = f"""---
date: {date_str}
tags:
- notes
---

"""
        content = frontmatter + entry
        # If the file doesn't exist, create it and start with "Notes"
        with open(note_path, 'w', encoding='utf-8') as note_file:
            note_file.write(content)

    return entry



async def process_document(
    bg_tasks: BackgroundTasks,
    document: File,
    title: Optional[str] = None,
    tts_mode: str = "summary",
    voice: str = DEFAULT_VOICE
):
    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')

    # Save the document to OBSIDIAN_RESOURCES_DIR
    document_content = await document.read()
    file_path = Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR / document.filename
    with open(file_path, 'wb') as f:
        f.write(document_content)

    parsed_content = await llm.extract_text(file_path)  # Ensure extract_text is awaited

    llm_title, summary = await llm.title_and_summary(parsed_content)
    try:
        readable_title = sanitize_filename(title if title else document.filename)

        if tts_mode == "full" or tts_mode == "content" or tts_mode == "body":
            tts_text = parsed_content
        elif tts_mode == "summary" or tts_mode == "excerpt":
            tts_text = summary
        else:
            tts_text = None

        frontmatter = f"""---
title: {readable_title}
added: {timestamp}
---
"""
        body = f"# {readable_title}\n\n"

        if tts_text:
            try:
                datetime_str = dt_datetime.now().strftime("%Y%m%d%H%M%S")
                audio_filename = f"{datetime_str} {readable_title}"
                audio_path = await tts.generate_speech(
                    bg_tasks=bg_tasks,
                    text=tts_text,
                    voice=voice,
                    model="eleven_turbo_v2",
                    podcast=True,
                    title=audio_filename,
                    output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR
                )
                audio_ext = Path(audio_path).suffix
                obsidian_link = f"![[{OBSIDIAN_RESOURCES_DIR}/{audio_filename}{audio_ext}]]"
                body += f"{obsidian_link}\n\n"
            except Exception as e:
                L.ERR(f"Failed in the TTS portion of clipping: {e}")

        body += f"> [!summary]+\n"
        body += f"> {summary}\n\n"
        body += parsed_content
        markdown_content = frontmatter + body

        markdown_filename = f"{readable_title}.md"
        encoding = 'utf-8'

        with open(markdown_filename, 'w', encoding=encoding) as md_file:
            md_file.write(markdown_content)

        L.INFO(f"Successfully saved to {markdown_filename}")

        return markdown_filename

    except Exception as e:
        L.ERR(f"Failed to clip: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))



async def process_article(
    bg_tasks: BackgroundTasks,
    parsed_content: Article,
    tts_mode: str = "summary",
    voice: str = DEFAULT_11L_VOICE
):
    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')

    readable_title = sanitize_filename(parsed_content.title or timestamp)
    markdown_filename, relative_path = assemble_journal_path(dt_datetime.now(), subdir="Articles", filename=readable_title, extension=".md")

    try:
        summary = await llm.summarize_text(parsed_content.clean_doc, "Summarize the provided text. Respond with the summary and nothing else. Do not otherwise acknowledge the request. Just provide the requested summary.")
        summary = summary.replace('\n', ' ')  # Remove line breaks

        if tts_mode == "full" or tts_mode == "content":
            tts_text = parsed_content.clean_doc
        elif tts_mode == "summary" or tts_mode == "excerpt":
            tts_text = summary
        else:
            tts_text = None

        banner_markdown = ''
        try:
            banner_url = parsed_content.top_image
            if banner_url != '':
                banner_image = download_file(banner_url, Path(OBSIDIAN_VAULT_DIR / OBSIDIAN_RESOURCES_DIR))
                if banner_image:
                    banner_markdown = f"![[{OBSIDIAN_RESOURCES_DIR}/{banner_image}]]"

        except Exception as e:
            L.ERR(f"No image found in article")

        authors = ', '.join('[[{}]]'.format(author) for author in parsed_content.authors)
        published_date = parsed_content.publish_date
        frontmatter = f"""---
title: {readable_title}
authors: {authors}
published: {published_date}
added: {timestamp}
banner: "{banner_markdown}"
tags:

"""
        frontmatter += '\n'.join(f" - {tag}" for tag in parsed_content.tags)
        frontmatter += '\n---\n'

        body = f"# {readable_title}\n\n"
        if tts_text:
            audio_filename = f"{published_date} {readable_title}"
            try:
                audio_path = await tts.generate_speech(bg_tasks=bg_tasks, text=tts_text, voice=voice, model="eleven_turbo_v2", podcast=True, title=audio_filename,
                                                       output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR)
                audio_ext = Path(audio_path).suffix
                obsidian_link = f"![[{OBSIDIAN_RESOURCES_DIR}/{audio_filename}{audio_ext}]]"
                body += f"{obsidian_link}\n\n"
            except Exception as e:
                L.ERR(f"Failed to generate TTS for np3k. {e}")

        try:
            body += f"by {authors} in {parsed_content.canonical_link}"  # update with method for getting the newspaper name
            body += f"> [!summary]+\n"
            body += f"> {summary}\n\n"
            body += parsed_content["content"]
            markdown_content = frontmatter + body

        except Exception as e:
            L.ERR(f"Failed to combine elements of article markdown.")

        try:
            with open(markdown_filename, 'w') as md_file:
                md_file.write(markdown_content)

            L.INFO(f"Successfully saved to {markdown_filename}")
            add_to_daily_note
            return markdown_filename

        except Exception as e:
            L.ERR(f"Failed to write markdown file")
            raise HTTPException(status_code=500, detail=str(e))

    except Exception as e:
        L.ERR(f"Failed to clip: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


async def process_article2(
    bg_tasks: BackgroundTasks,
    url: str,
    title: Optional[str] = None,
    encoding: str = 'utf-8',
    source: Optional[str] = None,
    tts_mode: str = "summary",
    voice: str = DEFAULT_11L_VOICE
):

    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')

    parsed_content = await parse_article(url, source)
    if parsed_content is None:
        return {"error": "Failed to retrieve content"}

    readable_title = sanitize_filename(title or parsed_content.get("title") or timestamp)
    markdown_filename, relative_path = assemble_journal_path(dt_datetime.now(), subdir="Articles", filename=readable_title, extension=".md")

    try:
        summary = await llm.summarize_text(parsed_content["content"], "Summarize the provided text. Respond with the summary and nothing else. Do not otherwise acknowledge the request. Just provide the requested summary.")
        summary = summary.replace('\n', ' ')  # Remove line breaks

        if tts_mode == "full" or tts_mode == "content":
            tts_text = parsed_content["content"]
        elif tts_mode == "summary" or tts_mode == "excerpt":
            tts_text = summary
        else:
            tts_text = None

        banner_markdown = ''
        try:
            banner_url = parsed_content.get('image', '')
            if banner_url != '':
                banner_image = download_file(banner_url, Path(OBSIDIAN_VAULT_DIR / OBSIDIAN_RESOURCES_DIR))
                if banner_image:
                    banner_markdown = f"![[{OBSIDIAN_RESOURCES_DIR}/{banner_image}]]"

        except Exception as e:
            L.ERR(f"No image found in article")

        authors = ', '.join('[[{}]]'.format(author) for author in parsed_content.get('authors', ['Unknown']))

        frontmatter = f"""---
title: {readable_title}
authors: {', '.join('[[{}]]'.format(author) for author in parsed_content.get('authors', ['Unknown']))}
published: {parsed_content.get('date_published', 'Unknown')}
added: {timestamp}
excerpt: {parsed_content.get('excerpt', '')}
banner: "{banner_markdown}"
tags:

"""
        frontmatter += '\n'.join(f" - {tag}" for tag in parsed_content.get('tags', []))
        frontmatter += '\n---\n'

        body = f"# {readable_title}\n\n"

        if tts_text:
            datetime_str = dt_datetime.now().strftime("%Y%m%d%H%M%S")
            audio_filename = f"{datetime_str} {readable_title}"
            try:
                audio_path = await tts.generate_speech(bg_tasks=bg_tasks, text=tts_text, voice=voice, model="eleven_turbo_v2", podcast=True, title=audio_filename,
                                                       output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR)
                audio_ext = Path(audio_path).suffix
                obsidian_link = f"![[{OBSIDIAN_RESOURCES_DIR}/{audio_filename}{audio_ext}]]"
                body += f"{obsidian_link}\n\n"
            except Exception as e:
                L.ERR(f"Failed to generate TTS for np3k. {e}")

        try:
            body += f"by {authors} in [{parsed_content.get('domain', urlparse(url).netloc.replace('www.', ''))}]({url}).\n\n"
            body += f"> [!summary]+\n"
            body += f"> {summary}\n\n"
            body += parsed_content["content"]
            markdown_content = frontmatter + body

        except Exception as e:
            L.ERR(f"Failed to combine elements of article markdown.")

        try:
            with open(markdown_filename, 'w', encoding=encoding) as md_file:
                md_file.write(markdown_content)

            L.INFO(f"Successfully saved to {markdown_filename}")
            add_to_daily_note
            return markdown_filename

        except Exception as e:
            L.ERR(f"Failed to write markdown file")
            raise HTTPException(status_code=500, detail=str(e))

    except Exception as e:
        L.ERR(f"Failed to clip {url}: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))



async def parse_article(url: str, source: Optional[str] = None) -> Article:
    source = source if source else trafilatura.fetch_url(url)
    traf = trafilatura.extract_metadata(filecontent=source, default_url=url)

    # Create and parse the newspaper3k Article
    article = Article(url)
    article.set_html(source)
    article.parse()

    L.INFO(f"Parsed {article.title}")

    # Update or set properties based on trafilatura and additional processing
    article.title = article.title or traf.title or url
    article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])

    article.publish_date = article.publish_date or traf.date
    try:
        article.publish_date = await loc.dt(article.publish_date, "UTC")
    except:
        L.DEBUG(f"Failed to localize {article.publish_date}")
        article.publish_date = await loc.dt(dt_datetime.now(), "UTC")

    article.meta_description = article.meta_description or traf.description
    article.text = trafilatura.extract(source, output_format="markdown", include_comments=False) or article.text
    article.top_image = article.top_image or traf.image
    article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title()
    article.meta_keywords = article.meta_keywords or traf.categories or traf.tags
    article.meta_keywords = article.meta_keywords if isinstance(article.meta_keywords, list) else [article.meta_keywords]

    # Set additional data in the additional_data dictionary
    article.additional_data = {
        'excerpt': article.meta_description,
        'domain': article.source_url,
        'tags': article.meta_keywords,
        'content': article.text  # Store the markdown content here
    }

    return article



async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
    if source:
        html_content = source
    elif url:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                html_content = await response.text()
    else:
        L.ERR(f"Unable to convert nothing to markdown.")
        return None

    # Use readability to extract the main content
    doc = Document(html_content)
    cleaned_html = doc.summary()

    # Parse the cleaned HTML with BeautifulSoup for any additional processing
    soup = BeautifulSoup(cleaned_html, 'html.parser')

    # Remove any remaining unwanted elements
    for element in soup(['script', 'style']):
        element.decompose()

    # Convert to markdown
    markdown_content = md(str(soup), heading_style="ATX")

    return markdown_content


async def process_archive(
    url: str,
    title: Optional[str] = None,
    encoding: str = 'utf-8',
    source: Optional[str] = None,
) -> Path:
    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
    readable_title = title if title else f"{url} - {timestamp}"

    content = await html_to_markdown(url, source)
    if content is None:
        raise HTTPException(status_code=400, detail="Failed to convert content to markdown")

    markdown_path, relative_path = assemble_archive_path(readable_title, ".md")

    markdown_content = f"---\n"
    markdown_content += f"title: {readable_title}\n"
    markdown_content += f"added: {timestamp}\n"
    markdown_content += f"url: {url}"
    markdown_content += f"date: {dt_datetime.now().strftime('%Y-%m-%d')}"
    markdown_content += f"---\n\n"
    markdown_content += f"# {readable_title}\n\n"
    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}"
    markdown_content += content

    try:
        markdown_path.parent.mkdir(parents=True, exist_ok=True)
        with open(markdown_path, 'w', encoding=encoding) as md_file:
            md_file.write(markdown_content)
        L.DEBUG(f"Successfully saved to {markdown_path}")
        return markdown_path
    except Exception as e:
        L.WARN(f"Failed to write markdown file: {str(e)}")
        return None

def download_file(url, folder):
    os.makedirs(folder, exist_ok=True)
    filename = str(uuid.uuid4()) + os.path.splitext(urlparse(url).path)[-1]
    filepath = os.path.join(folder, filename)

    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
    session.mount('http://', HTTPAdapter(max_retries=retries))
    session.mount('https://', HTTPAdapter(max_retries=retries))

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    try:
        response = session.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            if 'image' in response.headers.get('Content-Type', ''):
                with open(filepath, 'wb') as f:
                    f.write(response.content)
            else:
                L.ERR(f"Failed to download image: {url}, invalid content type: {response.headers.get('Content-Type')}")
                return None
        else:
            L.ERR(f"Failed to download image: {url}, status code: {response.status_code}")
            return None
    except Exception as e:
        L.ERR(f"Failed to download image: {url}, error: {str(e)}")
        return None
    return filename

def copy_file(local_path, folder):
    os.makedirs(folder, exist_ok=True)
    filename = os.path.basename(local_path)
    destination_path = os.path.join(folder, filename)
    shutil.copy(local_path, destination_path)
    return filename


async def save_file(file: UploadFile, folder: Path) -> Path:
    file_path = folder / f"{dt_datetime.now().strftime('%Y%m%d_%H%M%S')}_{file.filename}"
    with open(file_path, 'wb') as f:
        shutil.copyfileobj(file.file, f)
    return file_path
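For reference, a minimal sketch of how the new clipper endpoint might be exercised once this router is mounted. This is not part of the commit; the base URL, port, and mount prefix are assumptions, and only the form fields visible in clip_post above are used.

    # Hypothetical client call to the new POST /clip endpoint.
    # Assumes the sijapi app is reachable at http://localhost:4444 and the
    # news router is mounted at the application root; adjust base_url as needed.
    import requests

    base_url = "http://localhost:4444"  # assumption, not taken from this commit
    resp = requests.post(
        f"{base_url}/clip",
        data={
            "url": "https://example.com/some-article",
            "tts": "summary",       # per clip_post: 'summary', 'full'/'content', or anything else to skip TTS
            "encoding": "utf-8",
        },
    )
    resp.raise_for_status()
    print(resp.json()["markdown_filename"])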
sijapi/routers/note.py

@@ -3,26 +3,9 @@ Manages an Obsidian vault, in particular daily notes, using information and func
'''
from fastapi import APIRouter, BackgroundTasks, File, UploadFile, Form, HTTPException, Response, Query, Path as FastAPIPath
from fastapi.responses import JSONResponse, PlainTextResponse
from io import BytesIO
from pydantic import BaseModel
import os, re
import uuid
import aiohttp
import traceback
import requests
import mimetypes
import shutil
from zoneinfo import ZoneInfo
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from typing import Optional, Union, Dict, List, Tuple
from urllib.parse import urlparse
from urllib3.util.retry import Retry
import newspaper
from newspaper import Article
import trafilatura
from readability import Document
from requests.adapters import HTTPAdapter
import re
import os
from datetime import timedelta, datetime as dt_datetime, time as dt_time, date as dt_date

@@ -35,6 +18,7 @@ from sijapi.routers import cal, loc, tts, llm, time, sd, weather, asr
from sijapi.utilities import assemble_journal_path, assemble_archive_path, convert_to_12_hour_format, sanitize_filename, convert_degrees_to_cardinal, check_file_name, HOURLY_COLUMNS_MAPPING
from sijapi.classes import Location


note = APIRouter()

def list_and_correct_impermissible_files(root_dir, rename: bool = False):
@@ -827,535 +811,3 @@ async def update_daily_note_events(date_time: dt_datetime):
(The 535 removed lines are the ### CLIPPER ### endpoints and their helper functions, the same code added above in sijapi/routers/news.py, where the route decorators are now registered on the news router instead of note.)