Auto-update: Sat Jun 29 17:18:50 PDT 2024

sanj 2024-06-29 17:18:50 -07:00
parent 565a576c48
commit ad0ae30575
3 changed files with 560 additions and 549 deletions


@@ -24,6 +24,7 @@ MODULES:
ig: off
llm: on
loc: on
news: on
note: on
rag: off
sd: on

sijapi/routers/news.py Normal file

@@ -0,0 +1,558 @@
from fastapi import APIRouter, BackgroundTasks, File, UploadFile, Form, HTTPException, Response, Query, Path as FastAPIPath
from fastapi.responses import JSONResponse
from bs4 import BeautifulSoup
import requests
from markdownify import markdownify as md
import os
import mimetypes
from datetime import datetime as dt_datetime
import shutil
import uuid
import aiohttp
from pathlib import Path
from urllib.parse import urlparse
from urllib3.util.retry import Retry
from typing import Optional
from newspaper import Article
import trafilatura
from readability import Document
from requests.adapters import HTTPAdapter
from sijapi import API, L, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, GEO
from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path
from sijapi.routers import llm, tts, asr, loc
news = APIRouter()
### CLIPPER ###
@news.post("/clip")
async def clip_post(
bg_tasks: BackgroundTasks,
url: Optional[str] = Form(None),
source: Optional[str] = Form(None),
title: Optional[str] = Form(None),
tts: str = Form('summary'),
voice: str = Form(DEFAULT_VOICE),
encoding: str = Form('utf-8')
):
    # process_article expects an already-parsed Article, so parse the URL first
    parsed_content = await parse_article(url, source)
    if title:
        parsed_content.title = title
    markdown_filename = await process_article(bg_tasks, parsed_content, tts_mode=tts, voice=voice)
return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}
@news.post("/archive")
async def archive_post(
url: Optional[str] = Form(None),
source: Optional[str] = Form(None),
title: Optional[str] = Form(None),
encoding: str = Form('utf-8')
):
markdown_filename = await process_archive(url, title, encoding, source)
return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}
@news.get("/clip")
async def clip_get(
bg_tasks: BackgroundTasks,
url: str,
title: Optional[str] = Query(None),
encoding: str = Query('utf-8'),
tts: str = Query('summary'),
voice: str = Query(DEFAULT_VOICE)
):
    # As in clip_post, parse first and hand process_article a parsed Article
    parsed_content = await parse_article(url)
    if title:
        parsed_content.title = title
    markdown_filename = await process_article(bg_tasks, parsed_content, tts_mode=tts, voice=voice)
return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}
@news.post("/note/add")
async def note_add_endpoint(file: Optional[UploadFile] = File(None), text: Optional[str] = Form(None), source: Optional[str] = Form(None), bg_tasks: BackgroundTasks = None):
L.DEBUG(f"Received request on /note/add...")
if not file and not text:
L.WARN(f"... without any file or text!")
raise HTTPException(status_code=400, detail="Either text or a file must be provided")
else:
result = await process_for_daily_note(file, text, source, bg_tasks)
L.INFO(f"Result on /note/add: {result}")
        return JSONResponse(result, status_code=200)  # a 204 response must not carry a body
async def process_for_daily_note(file: Optional[UploadFile] = File(None), text: Optional[str] = None, source: Optional[str] = None, bg_tasks: BackgroundTasks = None):
now = dt_datetime.now()
transcription_entry = ""
file_entry = ""
if file:
L.DEBUG("File received...")
file_content = await file.read()
        # Guess the file type, falling back to a generic binary type
        guessed_type = mimetypes.guess_type(file.filename)
        file_type = guessed_type[0] if guessed_type[0] else "application/octet-stream"
L.DEBUG(f"Processing as {file_type}...")
# Extract the main type (e.g., 'audio', 'image', 'video')
main_type = file_type.split('/')[0]
subdir = main_type.title() if main_type else "Documents"
absolute_path, relative_path = assemble_journal_path(now, subdir=subdir, filename=file.filename)
L.DEBUG(f"Destination path: {absolute_path}")
with open(absolute_path, 'wb') as f:
f.write(file_content)
L.DEBUG(f"Processing {f.name}...")
if main_type == 'audio':
            transcription_entry = await asr.transcribe_audio(file_path=absolute_path, params=asr.TranscribeParams(model="small-en", language="en", threads=6))
file_entry = f"![[{relative_path}]]"
elif main_type == 'image':
file_entry = f"![[{relative_path}]]"
else:
file_entry = f"[Source]({relative_path})"
text_entry = text if text else ""
L.DEBUG(f"transcription: {transcription_entry}\nfile_entry: {file_entry}\ntext_entry: {text_entry}")
return await add_to_daily_note(transcription_entry, file_entry, text_entry, now)
async def add_to_daily_note(transcription: str = None, file_link: str = None, additional_text: str = None, date_time: dt_datetime = None):
date_time = date_time or dt_datetime.now()
note_path, _ = assemble_journal_path(date_time, filename='Notes', extension=".md", no_timestamp = True)
time_str = date_time.strftime("%H:%M")
entry_lines = []
if additional_text and additional_text.strip():
entry_lines.append(f"\t* {additional_text.strip()}")
if transcription and transcription.strip():
entry_lines.append(f"\t* {transcription.strip()}")
if file_link and file_link.strip():
entry_lines.append(f"\t\t {file_link.strip()}")
entry = f"\n* **{time_str}**\n" + "\n".join(entry_lines)
# Write the entry to the end of the file
if note_path.exists():
with open(note_path, 'a', encoding='utf-8') as note_file:
note_file.write(entry)
else:
date_str = date_time.strftime("%Y-%m-%d")
frontmatter = f"""---
date: {date_str}
tags:
- notes
---
"""
content = frontmatter + entry
        # If the file doesn't exist, create it, opening with the YAML frontmatter
with open(note_path, 'w', encoding='utf-8') as note_file:
note_file.write(content)
return entry
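# For illustration, a call like add_to_daily_note(transcription="hello world",
# file_link="![[Journal/Audio/memo.wav]]") at 17:18 appends roughly the following
# (the wikilink path is hypothetical; real paths come from assemble_journal_path):
#
#   * **17:18**
#       * hello world
#            ![[Journal/Audio/memo.wav]]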
async def process_document(
bg_tasks: BackgroundTasks,
    document: UploadFile,
title: Optional[str] = None,
tts_mode: str = "summary",
voice: str = DEFAULT_VOICE
):
timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
# Save the document to OBSIDIAN_RESOURCES_DIR
document_content = await document.read()
file_path = Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR / document.filename
with open(file_path, 'wb') as f:
f.write(document_content)
parsed_content = await llm.extract_text(file_path) # Ensure extract_text is awaited
llm_title, summary = await llm.title_and_summary(parsed_content)
try:
readable_title = sanitize_filename(title if title else document.filename)
if tts_mode == "full" or tts_mode == "content" or tts_mode == "body":
tts_text = parsed_content
elif tts_mode == "summary" or tts_mode == "excerpt":
tts_text = summary
else:
tts_text = None
frontmatter = f"""---
title: {readable_title}
added: {timestamp}
---
"""
body = f"# {readable_title}\n\n"
if tts_text:
try:
datetime_str = dt_datetime.now().strftime("%Y%m%d%H%M%S")
audio_filename = f"{datetime_str} {readable_title}"
audio_path = await tts.generate_speech(
bg_tasks=bg_tasks,
text=tts_text,
voice=voice,
model="eleven_turbo_v2",
podcast=True,
title=audio_filename,
output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR
)
audio_ext = Path(audio_path).suffix
obsidian_link = f"![[{OBSIDIAN_RESOURCES_DIR}/{audio_filename}{audio_ext}]]"
body += f"{obsidian_link}\n\n"
except Exception as e:
L.ERR(f"Failed in the TTS portion of clipping: {e}")
body += f"> [!summary]+\n"
body += f"> {summary}\n\n"
body += parsed_content
markdown_content = frontmatter + body
markdown_filename = f"{readable_title}.md"
encoding = 'utf-8'
with open(markdown_filename, 'w', encoding=encoding) as md_file:
md_file.write(markdown_content)
L.INFO(f"Successfully saved to {markdown_filename}")
return markdown_filename
except Exception as e:
L.ERR(f"Failed to clip: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
async def process_article(
bg_tasks: BackgroundTasks,
parsed_content: Article,
tts_mode: str = "summary",
voice: str = DEFAULT_11L_VOICE
):
timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
readable_title = sanitize_filename(parsed_content.title or timestamp)
markdown_filename, relative_path = assemble_journal_path(dt_datetime.now(), subdir="Articles", filename=readable_title, extension=".md")
try:
        # parse_article stores the extracted markdown on .text, not .clean_doc
        summary = await llm.summarize_text(parsed_content.text, "Summarize the provided text. Respond with the summary and nothing else. Do not otherwise acknowledge the request. Just provide the requested summary.")
        summary = summary.replace('\n', ' ')  # Remove line breaks
        if tts_mode == "full" or tts_mode == "content":
            tts_text = parsed_content.text
elif tts_mode == "summary" or tts_mode == "excerpt":
tts_text = summary
else:
tts_text = None
banner_markdown = ''
try:
banner_url = parsed_content.top_image
if banner_url != '':
                banner_image = download_file(banner_url, Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR)
if banner_image:
banner_markdown = f"![[{OBSIDIAN_RESOURCES_DIR}/{banner_image}]]"
except Exception as e:
L.ERR(f"No image found in article")
authors = ', '.join('[[{}]]'.format(author) for author in parsed_content.authors)
published_date = parsed_content.publish_date
frontmatter = f"""---
title: {readable_title}
authors: {authors}
published: {published_date}
added: {timestamp}
banner: "{banner_markdown}"
tags:
"""
frontmatter += '\n'.join(f" - {tag}" for tag in parsed_content.tags)
frontmatter += '\n---\n'
body = f"# {readable_title}\n\n"
if tts_text:
audio_filename = f"{published_date} {readable_title}"
try:
audio_path = await tts.generate_speech(bg_tasks=bg_tasks, text=tts_text, voice=voice, model="eleven_turbo_v2", podcast=True, title=audio_filename,
output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR)
audio_ext = Path(audio_path).suffix
obsidian_link = f"![[{OBSIDIAN_RESOURCES_DIR}/{audio_filename}{audio_ext}]]"
body += f"{obsidian_link}\n\n"
except Exception as e:
L.ERR(f"Failed to generate TTS for np3k. {e}")
try:
body += f"by {authors} in {parsed_content.canonical_link}" # update with method for getting the newspaper name
body += f"> [!summary]+\n"
body += f"> {summary}\n\n"
body += parsed_content["content"]
markdown_content = frontmatter + body
except Exception as e:
L.ERR(f"Failed to combine elements of article markdown.")
try:
with open(markdown_filename, 'w') as md_file:
md_file.write(markdown_content)
L.INFO(f"Successfully saved to {markdown_filename}")
return markdown_filename
except Exception as e:
L.ERR(f"Failed to write markdown file")
raise HTTPException(status_code=500, detail=str(e))
except Exception as e:
L.ERR(f"Failed to clip: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
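# Note: process_article above expects an already-parsed Article, while
# process_article2 below accepts a raw URL and calls parse_article itself.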
async def process_article2(
bg_tasks: BackgroundTasks,
url: str,
title: Optional[str] = None,
encoding: str = 'utf-8',
source: Optional[str] = None,
tts_mode: str = "summary",
voice: str = DEFAULT_11L_VOICE
):
timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
parsed_content = await parse_article(url, source)
if parsed_content is None:
return {"error": "Failed to retrieve content"}
    readable_title = sanitize_filename(title or parsed_content.title or timestamp)
markdown_filename, relative_path = assemble_journal_path(dt_datetime.now(), subdir="Articles", filename=readable_title, extension=".md")
try:
summary = await llm.summarize_text(parsed_content["content"], "Summarize the provided text. Respond with the summary and nothing else. Do not otherwise acknowledge the request. Just provide the requested summary.")
summary = summary.replace('\n', ' ') # Remove line breaks
if tts_mode == "full" or tts_mode == "content":
tts_text = parsed_content["content"]
elif tts_mode == "summary" or tts_mode == "excerpt":
tts_text = summary
else:
tts_text = None
banner_markdown = ''
try:
            banner_url = parsed_content.top_image or ''
if banner_url != '':
                banner_image = download_file(banner_url, Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR)
if banner_image:
banner_markdown = f"![[{OBSIDIAN_RESOURCES_DIR}/{banner_image}]]"
except Exception as e:
L.ERR(f"No image found in article")
        authors = ', '.join('[[{}]]'.format(author) for author in (parsed_content.authors or ['Unknown']))
        frontmatter = f"""---
title: {readable_title}
authors: {authors}
published: {parsed_content.publish_date or 'Unknown'}
added: {timestamp}
excerpt: {parsed_content.additional_data.get('excerpt') or ''}
banner: "{banner_markdown}"
tags:
"""
        frontmatter += '\n'.join(f" - {tag}" for tag in (parsed_content.additional_data.get('tags') or []))
frontmatter += '\n---\n'
body = f"# {readable_title}\n\n"
if tts_text:
datetime_str = dt_datetime.now().strftime("%Y%m%d%H%M%S")
audio_filename = f"{datetime_str} {readable_title}"
try:
audio_path = await tts.generate_speech(bg_tasks=bg_tasks, text=tts_text, voice=voice, model="eleven_turbo_v2", podcast=True, title=audio_filename,
output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR)
audio_ext = Path(audio_path).suffix
obsidian_link = f"![[{OBSIDIAN_RESOURCES_DIR}/{audio_filename}{audio_ext}]]"
body += f"{obsidian_link}\n\n"
except Exception as e:
L.ERR(f"Failed to generate TTS for np3k. {e}")
try:
body += f"by {authors} in [{parsed_content.get('domain', urlparse(url).netloc.replace('www.', ''))}]({url}).\n\n"
body += f"> [!summary]+\n"
body += f"> {summary}\n\n"
body += parsed_content["content"]
markdown_content = frontmatter + body
except Exception as e:
L.ERR(f"Failed to combine elements of article markdown.")
try:
with open(markdown_filename, 'w', encoding=encoding) as md_file:
md_file.write(markdown_content)
L.INFO(f"Successfully saved to {markdown_filename}")
return markdown_filename
except Exception as e:
L.ERR(f"Failed to write markdown file")
raise HTTPException(status_code=500, detail=str(e))
except Exception as e:
L.ERR(f"Failed to clip {url}: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
async def parse_article(url: str, source: Optional[str] = None) -> Article:
source = source if source else trafilatura.fetch_url(url)
traf = trafilatura.extract_metadata(filecontent=source, default_url=url)
# Create and parse the newspaper3k Article
article = Article(url)
article.set_html(source)
article.parse()
L.INFO(f"Parsed {article.title}")
# Update or set properties based on trafilatura and additional processing
article.title = article.title or traf.title or url
article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
article.publish_date = article.publish_date or traf.date
try:
article.publish_date = await loc.dt(article.publish_date, "UTC")
    except Exception:
L.DEBUG(f"Failed to localize {article.publish_date}")
article.publish_date = await loc.dt(dt_datetime.now(), "UTC")
article.meta_description = article.meta_description or traf.description
article.text = trafilatura.extract(source, output_format="markdown", include_comments=False) or article.text
article.top_image = article.top_image or traf.image
article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title()
article.meta_keywords = article.meta_keywords or traf.categories or traf.tags
article.meta_keywords = article.meta_keywords if isinstance(article.meta_keywords, list) else [article.meta_keywords]
# Set additional data in the additional_data dictionary
article.additional_data = {
'excerpt': article.meta_description,
'domain': article.source_url,
'tags': article.meta_keywords,
'content': article.text # Store the markdown content here
}
return article
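# Merge order above: values newspaper3k parsed from the HTML win, and
# trafilatura's metadata fills the gaps (title, authors, date, image, tags);
# the body text prefers trafilatura's markdown extraction when it succeeds.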
async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
if source:
html_content = source
elif url:
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
html_content = await response.text()
else:
L.ERR(f"Unable to convert nothing to markdown.")
return None
# Use readability to extract the main content
doc = Document(html_content)
cleaned_html = doc.summary()
# Parse the cleaned HTML with BeautifulSoup for any additional processing
soup = BeautifulSoup(cleaned_html, 'html.parser')
# Remove any remaining unwanted elements
for element in soup(['script', 'style']):
element.decompose()
# Convert to markdown
markdown_content = md(str(soup), heading_style="ATX")
return markdown_content
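# Usage sketch (hypothetical URL), from inside the running event loop:
#   markdown = await html_to_markdown(url="https://example.com/post")
# Returns None when neither url nor source is supplied.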
async def process_archive(
url: str,
title: Optional[str] = None,
encoding: str = 'utf-8',
source: Optional[str] = None,
) -> Path:
timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
readable_title = title if title else f"{url} - {timestamp}"
content = await html_to_markdown(url, source)
if content is None:
raise HTTPException(status_code=400, detail="Failed to convert content to markdown")
markdown_path, relative_path = assemble_archive_path(readable_title, ".md")
markdown_content = f"---\n"
markdown_content += f"title: {readable_title}\n"
markdown_content += f"added: {timestamp}\n"
markdown_content += f"url: {url}"
markdown_content += f"date: {dt_datetime.now().strftime('%Y-%m-%d')}"
markdown_content += f"---\n\n"
markdown_content += f"# {readable_title}\n\n"
markdown_content += f"Clipped from [{url}]({url}) on {timestamp}"
markdown_content += content
try:
markdown_path.parent.mkdir(parents=True, exist_ok=True)
with open(markdown_path, 'w', encoding=encoding) as md_file:
md_file.write(markdown_content)
L.DEBUG(f"Successfully saved to {markdown_path}")
return markdown_path
except Exception as e:
L.WARN(f"Failed to write markdown file: {str(e)}")
return None
def download_file(url, folder):
os.makedirs(folder, exist_ok=True)
filename = str(uuid.uuid4()) + os.path.splitext(urlparse(url).path)[-1]
filepath = os.path.join(folder, filename)
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
try:
response = session.get(url, headers=headers, timeout=10)
if response.status_code == 200:
if 'image' in response.headers.get('Content-Type', ''):
with open(filepath, 'wb') as f:
f.write(response.content)
else:
L.ERR(f"Failed to download image: {url}, invalid content type: {response.headers.get('Content-Type')}")
return None
else:
L.ERR(f"Failed to download image: {url}, status code: {response.status_code}")
return None
except Exception as e:
L.ERR(f"Failed to download image: {url}, error: {str(e)}")
return None
return filename
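# With total=5 and backoff_factor=1, urllib3 retries 502/503/504 responses up to
# five times, sleeping roughly backoff_factor * 2**(n - 1) seconds between tries.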
def copy_file(local_path, folder):
os.makedirs(folder, exist_ok=True)
filename = os.path.basename(local_path)
destination_path = os.path.join(folder, filename)
shutil.copy(local_path, destination_path)
return filename
async def save_file(file: UploadFile, folder: Path) -> Path:
file_path = folder / f"{dt_datetime.now().strftime('%Y%m%d_%H%M%S')}_{file.filename}"
with open(file_path, 'wb') as f:
shutil.copyfileobj(file.file, f)
return file_path


@@ -3,26 +3,9 @@ Manages an Obsidian vault, in particular daily notes, using information and func
'''
from fastapi import APIRouter, BackgroundTasks, File, UploadFile, Form, HTTPException, Response, Query, Path as FastAPIPath
from fastapi.responses import JSONResponse, PlainTextResponse
from io import BytesIO
from pydantic import BaseModel
import os, re
import uuid
import aiohttp
import traceback
import requests
import mimetypes
import shutil
from zoneinfo import ZoneInfo
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from typing import Optional, Union, Dict, List, Tuple
from urllib.parse import urlparse
from urllib3.util.retry import Retry
import newspaper
from newspaper import Article
import trafilatura
from readability import Document
from requests.adapters import HTTPAdapter
import re
import os
from datetime import timedelta, datetime as dt_datetime, time as dt_time, date as dt_date
@@ -35,6 +18,7 @@ from sijapi.routers import cal, loc, tts, llm, time, sd, weather, asr
from sijapi.utilities import assemble_journal_path, assemble_archive_path, convert_to_12_hour_format, sanitize_filename, convert_degrees_to_cardinal, check_file_name, HOURLY_COLUMNS_MAPPING
from sijapi.classes import Location
note = APIRouter()
def list_and_correct_impermissible_files(root_dir, rename: bool = False):
@@ -827,535 +811,3 @@ async def update_daily_note_events(date_time: dt_datetime):