diff --git a/sijapi/config/api.yaml-example b/sijapi/config/api.yaml-example
index 9677ac5..c48598a 100644
--- a/sijapi/config/api.yaml-example
+++ b/sijapi/config/api.yaml-example
@@ -24,6 +24,7 @@ MODULES:
   ig: off
   llm: on
   loc: on
+  news: on
   note: on
   rag: off
   sd: on
diff --git a/sijapi/routers/news.py b/sijapi/routers/news.py
new file mode 100644
index 0000000..096a351
--- /dev/null
+++ b/sijapi/routers/news.py
@@ -0,0 +1,558 @@
+from fastapi import APIRouter, BackgroundTasks, File, UploadFile, Form, HTTPException, Response, Query, Path as FastAPIPath
+from fastapi.responses import JSONResponse
+from zoneinfo import ZoneInfo
+from io import BytesIO
+from pydantic import BaseModel
+from bs4 import BeautifulSoup
+import requests
+from markdownify import markdownify as md
+import os
+import mimetypes
+from datetime import datetime as dt_datetime
+import shutil
+import uuid
+import aiohttp
+from pathlib import Path
+from urllib.parse import urlparse
+from urllib3.util.retry import Retry
+from typing import Optional
+import newspaper
+from newspaper import Article
+import trafilatura
+from readability import Document
+from requests.adapters import HTTPAdapter
+from sijapi import API, L, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, GEO
+from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path
+from sijapi.routers import llm, tts, asr, loc
+
+news = APIRouter()
+
+
+### CLIPPER ###
+@news.post("/clip")
+async def clip_post(
+    bg_tasks: BackgroundTasks,
+    url: Optional[str] = Form(None),
+    source: Optional[str] = Form(None),
+    title: Optional[str] = Form(None),
+    tts: str = Form('summary'),
+    voice: str = Form(DEFAULT_VOICE),
+    encoding: str = Form('utf-8')
+):
+    # process_article2 takes a URL; process_article expects an already-parsed Article
+    markdown_filename = await process_article2(bg_tasks, url, title, encoding, source, tts, voice)
+    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}
+
+@news.post("/archive")
+async def archive_post(
+    url: Optional[str] = Form(None),
+    source: Optional[str] = Form(None),
+    title: Optional[str] = Form(None),
+    encoding: str = Form('utf-8')
+):
+    markdown_filename = await process_archive(url, title, encoding, source)
+    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}
+
+@news.get("/clip")
+async def clip_get(
+    bg_tasks: BackgroundTasks,
+    url: str,
+    title: Optional[str] = Query(None),
+    encoding: str = Query('utf-8'),
+    tts: str = Query('summary'),
+    voice: str = Query(DEFAULT_VOICE)
+):
+    markdown_filename = await process_article2(bg_tasks, url, title, encoding, tts_mode=tts, voice=voice)
+    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}
+
+@news.post("/note/add")
+async def note_add_endpoint(file: Optional[UploadFile] = File(None), text: Optional[str] = Form(None), source: Optional[str] = Form(None), bg_tasks: BackgroundTasks = None):
+    L.DEBUG(f"Received request on /note/add...")
+    if not file and not text:
+        L.WARN(f"... without any file or text!")
+        raise HTTPException(status_code=400, detail="Either text or a file must be provided")
+    else:
+        result = await process_for_daily_note(file, text, source, bg_tasks)
+        L.INFO(f"Result on /note/add: {result}")
+        # 204 responses may not carry a body, so return 200 with the result
+        return JSONResponse(result, status_code=200)
+
+async def process_for_daily_note(file: Optional[UploadFile] = File(None), text: Optional[str] = None, source: Optional[str] = None, bg_tasks: BackgroundTasks = None):
+    now = dt_datetime.now()
+    transcription_entry = ""
+    file_entry = ""
+    if file:
+        L.DEBUG("File received...")
+        file_content = await file.read()
+
+        # Fall back to a generic type when the extension is unrecognized
+        guessed_type = mimetypes.guess_type(file.filename)
+        file_type = guessed_type[0] if guessed_type[0] else "application/octet-stream"
+
+        L.DEBUG(f"Processing as {file_type}...")
+
+        # Extract the main type (e.g., 'audio', 'image', 'video')
+        main_type = file_type.split('/')[0]
+        subdir = main_type.title() if main_type else "Documents"
+
+        absolute_path, relative_path = assemble_journal_path(now, subdir=subdir, filename=file.filename)
+        L.DEBUG(f"Destination path: {absolute_path}")
+
+        with open(absolute_path, 'wb') as f:
+            f.write(file_content)
+        L.DEBUG(f"Processing {f.name}...")
+
+        if main_type == 'audio':
+            # assign to transcription_entry so the transcription reaches the note
+            transcription_entry = await asr.transcribe_audio(file_path=absolute_path, params=asr.TranscribeParams(model="small-en", language="en", threads=6))
+            file_entry = f"![[{relative_path}]]"
+        elif main_type == 'image':
+            file_entry = f"![[{relative_path}]]"
+        else:
+            file_entry = f"[Source]({relative_path})"
+
+    text_entry = text if text else ""
+    L.DEBUG(f"transcription: {transcription_entry}\nfile_entry: {file_entry}\ntext_entry: {text_entry}")
+    return await add_to_daily_note(transcription_entry, file_entry, text_entry, now)
+
+async def add_to_daily_note(transcription: str = None, file_link: str = None, additional_text: str = None, date_time: dt_datetime = None):
+    date_time = date_time or dt_datetime.now()
+    note_path, _ = assemble_journal_path(date_time, filename='Notes', extension=".md", no_timestamp = True)
+    time_str = date_time.strftime("%H:%M")
+
+    entry_lines = []
+    if additional_text and additional_text.strip():
+        entry_lines.append(f"\t* {additional_text.strip()}")
+    if transcription and transcription.strip():
+        entry_lines.append(f"\t* {transcription.strip()}")
+    if file_link and file_link.strip():
+        entry_lines.append(f"\t\t {file_link.strip()}")
+
+    entry = f"\n* **{time_str}**\n" + "\n".join(entry_lines)
+
+    # Append the entry to the end of the file
+    if note_path.exists():
+        with open(note_path, 'a', encoding='utf-8') as note_file:
+            note_file.write(entry)
+    else:
+        # If the file doesn't exist, create it with frontmatter before the entry
+        date_str = date_time.strftime("%Y-%m-%d")
+        frontmatter = f"""---
+date: {date_str}
+tags:
+ - notes
+---
+
+"""
+        content = frontmatter + entry
+        with open(note_path, 'w', encoding='utf-8') as note_file:
+            note_file.write(content)
+
+    return entry
+
+
+
+async def process_document(
+    bg_tasks: BackgroundTasks,
+    document: UploadFile,
+    title: Optional[str] = None,
+    tts_mode: str = "summary",
+    voice: str = DEFAULT_VOICE
+):
+    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
+
+    # Save the document to OBSIDIAN_RESOURCES_DIR
+    document_content = await document.read()
+    file_path = Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR / document.filename
+    with open(file_path, 'wb') as f:
+        f.write(document_content)
+
+    parsed_content = await llm.extract_text(file_path)
+    llm_title, summary = await llm.title_and_summary(parsed_content)
+    try:
+        readable_title = sanitize_filename(title if title else document.filename)
+
+        if tts_mode in ("full", "content", "body"):
+            tts_text = parsed_content
+        elif tts_mode in ("summary", "excerpt"):
+            tts_text = summary
+        else:
+            tts_text = None
+
+        frontmatter = f"""---
+title: {readable_title}
+added: {timestamp}
+---
+"""
+        body = f"# {readable_title}\n\n"
+
+        if tts_text:
+            try:
+                datetime_str = dt_datetime.now().strftime("%Y%m%d%H%M%S")
+                audio_filename = f"{datetime_str} {readable_title}"
+                audio_path = await tts.generate_speech(
+                    bg_tasks=bg_tasks,
+                    text=tts_text,
+                    voice=voice,
+                    model="eleven_turbo_v2",
+                    podcast=True,
+                    title=audio_filename,
+                    output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR
+                )
+                audio_ext = Path(audio_path).suffix
+                obsidian_link = f"![[{OBSIDIAN_RESOURCES_DIR}/{audio_filename}{audio_ext}]]"
+                body += f"{obsidian_link}\n\n"
+            except Exception as e:
+                L.ERR(f"Failed in the TTS portion of clipping: {e}")
+
+        body += f"> [!summary]+\n"
+        body += f"> {summary}\n\n"
+        body += parsed_content
+        markdown_content = frontmatter + body
+
+        markdown_filename = f"{readable_title}.md"
+        encoding = 'utf-8'
+
+        with open(markdown_filename, 'w', encoding=encoding) as md_file:
+            md_file.write(markdown_content)
+
+        L.INFO(f"Successfully saved to {markdown_filename}")
+
+        return markdown_filename
+
+    except Exception as e:
+        L.ERR(f"Failed to clip: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+
+async def process_article(
+    bg_tasks: BackgroundTasks,
+    parsed_content: Article,
+    tts_mode: str = "summary",
+    voice: str = DEFAULT_11L_VOICE
+):
+    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
+
+    readable_title = sanitize_filename(parsed_content.title or timestamp)
+    markdown_filename, relative_path = assemble_journal_path(dt_datetime.now(), subdir="Articles", filename=readable_title, extension=".md")
+
+    try:
+        # parse_article stores the extracted markdown on article.text
+        summary = await llm.summarize_text(parsed_content.text, "Summarize the provided text. Respond with the summary and nothing else. Do not otherwise acknowledge the request. Just provide the requested summary.")
+        summary = summary.replace('\n', ' ')  # remove line breaks
+
+        if tts_mode in ("full", "content"):
+            tts_text = parsed_content.text
+        elif tts_mode in ("summary", "excerpt"):
+            tts_text = summary
+        else:
+            tts_text = None
+
+        banner_markdown = ''
+        try:
+            banner_url = parsed_content.top_image
+            if banner_url:
+                banner_image = download_file(banner_url, Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR)
+                if banner_image:
+                    banner_markdown = f"![[{OBSIDIAN_RESOURCES_DIR}/{banner_image}]]"
+        except Exception as e:
+            L.ERR(f"No image found in article: {e}")
+
+        authors = ', '.join('[[{}]]'.format(author) for author in parsed_content.authors)
+        published_date = parsed_content.publish_date
+        frontmatter = f"""---
+title: {readable_title}
+authors: {authors}
+published: {published_date}
+added: {timestamp}
+banner: "{banner_markdown}"
+tags:
+
+"""
+        frontmatter += '\n'.join(f" - {tag}" for tag in parsed_content.tags)
+        frontmatter += '\n---\n'
+
+        body = f"# {readable_title}\n\n"
+        if tts_text:
+            audio_filename = f"{published_date} {readable_title}"
+            try:
+                audio_path = await tts.generate_speech(bg_tasks=bg_tasks, text=tts_text, voice=voice, model="eleven_turbo_v2", podcast=True, title=audio_filename,
+                                                       output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR)
+                audio_ext = Path(audio_path).suffix
+                obsidian_link = f"![[{OBSIDIAN_RESOURCES_DIR}/{audio_filename}{audio_ext}]]"
+                body += f"{obsidian_link}\n\n"
+            except Exception as e:
+                L.ERR(f"Failed to generate TTS for np3k. {e}")
+
+        try:
+            body += f"by {authors} in {parsed_content.canonical_link}\n\n"  # TODO: replace with a method for getting the newspaper name
+            body += f"> [!summary]+\n"
+            body += f"> {summary}\n\n"
+            body += parsed_content.text
+            markdown_content = frontmatter + body
+        except Exception as e:
+            L.ERR(f"Failed to combine elements of article markdown: {e}")
+            raise  # don't fall through and write an undefined markdown_content
+
+        try:
+            with open(markdown_filename, 'w') as md_file:
+                md_file.write(markdown_content)
+
+            L.INFO(f"Successfully saved to {markdown_filename}")
+            return markdown_filename
+        except Exception as e:
+            L.ERR(f"Failed to write markdown file")
+            raise HTTPException(status_code=500, detail=str(e))
+
+    except Exception as e:
+        L.ERR(f"Failed to clip: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+async def process_article2(
+    bg_tasks: BackgroundTasks,
+    url: str,
+    title: Optional[str] = None,
+    encoding: str = 'utf-8',
+    source: Optional[str] = None,
+    tts_mode: str = "summary",
+    voice: str = DEFAULT_11L_VOICE
+):
+    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
+
+    # parse_article returns a newspaper Article augmented with additional_data
+    parsed_content = await parse_article(url, source)
+    if parsed_content is None:
+        return {"error": "Failed to retrieve content"}
+
+    readable_title = sanitize_filename(title or parsed_content.title or timestamp)
+    markdown_filename, relative_path = assemble_journal_path(dt_datetime.now(), subdir="Articles", filename=readable_title, extension=".md")
+
+    try:
+        summary = await llm.summarize_text(parsed_content.text, "Summarize the provided text. Respond with the summary and nothing else. Do not otherwise acknowledge the request. Just provide the requested summary.")
+        summary = summary.replace('\n', ' ')  # remove line breaks
+
+        if tts_mode in ("full", "content"):
+            tts_text = parsed_content.text
+        elif tts_mode in ("summary", "excerpt"):
+            tts_text = summary
+        else:
+            tts_text = None
+
+        banner_markdown = ''
+        try:
+            banner_url = parsed_content.top_image
+            if banner_url:
+                banner_image = download_file(banner_url, Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR)
+                if banner_image:
+                    banner_markdown = f"![[{OBSIDIAN_RESOURCES_DIR}/{banner_image}]]"
+        except Exception as e:
+            L.ERR(f"No image found in article: {e}")
+
+        authors = ', '.join('[[{}]]'.format(author) for author in (parsed_content.authors or ['Unknown']))
+
+        frontmatter = f"""---
+title: {readable_title}
+authors: {authors}
+published: {parsed_content.publish_date or 'Unknown'}
+added: {timestamp}
+excerpt: {parsed_content.additional_data.get('excerpt', '')}
+banner: "{banner_markdown}"
+tags:
+
+"""
+        frontmatter += '\n'.join(f" - {tag}" for tag in parsed_content.additional_data.get('tags', []))
+        frontmatter += '\n---\n'
+
+        body = f"# {readable_title}\n\n"
+
+        if tts_text:
+            datetime_str = dt_datetime.now().strftime("%Y%m%d%H%M%S")
+            audio_filename = f"{datetime_str} {readable_title}"
+            try:
+                audio_path = await tts.generate_speech(bg_tasks=bg_tasks, text=tts_text, voice=voice, model="eleven_turbo_v2", podcast=True, title=audio_filename,
+                                                       output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR)
+                audio_ext = Path(audio_path).suffix
+                obsidian_link = f"![[{OBSIDIAN_RESOURCES_DIR}/{audio_filename}{audio_ext}]]"
+                body += f"{obsidian_link}\n\n"
+            except Exception as e:
+                L.ERR(f"Failed to generate TTS for np3k. {e}")
+
+        try:
+            domain = parsed_content.additional_data.get('domain') or urlparse(url).netloc.replace('www.', '')
+            body += f"by {authors} in [{domain}]({url}).\n\n"
+            body += f"> [!summary]+\n"
+            body += f"> {summary}\n\n"
+            body += parsed_content.text
+            markdown_content = frontmatter + body
+        except Exception as e:
+            L.ERR(f"Failed to combine elements of article markdown: {e}")
+            raise  # don't fall through and write an undefined markdown_content
+
+        try:
+            with open(markdown_filename, 'w', encoding=encoding) as md_file:
+                md_file.write(markdown_content)
+
+            L.INFO(f"Successfully saved to {markdown_filename}")
+            return markdown_filename
+        except Exception as e:
+            L.ERR(f"Failed to write markdown file")
+            raise HTTPException(status_code=500, detail=str(e))
+
+    except Exception as e:
+        L.ERR(f"Failed to clip {url}: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+
+async def parse_article(url: str, source: Optional[str] = None) -> Article:
+    source = source if source else trafilatura.fetch_url(url)
+    traf = trafilatura.extract_metadata(filecontent=source, default_url=url)
+
+    # Create and parse the newspaper3k Article
+    article = Article(url)
+    article.set_html(source)
+    article.parse()
+
+    L.INFO(f"Parsed {article.title}")
+
+    # Update or set properties based on trafilatura and additional processing
+    article.title = article.title or traf.title or url
+    article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
+
+    article.publish_date = article.publish_date or traf.date
+    try:
+        article.publish_date = await loc.dt(article.publish_date, "UTC")
+    except Exception:
+        L.DEBUG(f"Failed to localize {article.publish_date}")
+        article.publish_date = await loc.dt(dt_datetime.now(), "UTC")
+
+    article.meta_description = article.meta_description or traf.description
+    article.text = trafilatura.extract(source, output_format="markdown", include_comments=False) or article.text
+    article.top_image = article.top_image or traf.image
+    article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title()
+    article.meta_keywords = article.meta_keywords or traf.categories or traf.tags
+    article.meta_keywords = article.meta_keywords if isinstance(article.meta_keywords, list) else [article.meta_keywords]
+
+    # Store derived values where the processors expect them
+    article.additional_data = {
+        'excerpt': article.meta_description,
+        'domain': article.source_url,
+        'tags': article.meta_keywords,
+        'content': article.text  # the extracted markdown content
+    }
+
+    return article
+
+
+
+async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
+    if source:
+        html_content = source
+    elif url:
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as response:
+                html_content = await response.text()
+    else:
+        L.ERR(f"Neither a URL nor source HTML was provided; nothing to convert to markdown.")
+        return None
+
+    # Use readability to extract the main content
+    doc = Document(html_content)
+    cleaned_html = doc.summary()
+
+    # Parse the cleaned HTML with BeautifulSoup for any additional processing
+    soup = BeautifulSoup(cleaned_html, 'html.parser')
+
+    # Remove any remaining unwanted elements
+    for element in soup(['script', 'style']):
+        element.decompose()
+
+    # Convert to markdown
+    markdown_content = md(str(soup), heading_style="ATX")
+
+    return markdown_content
+
+
+async def process_archive(
+    url: str,
+    title: Optional[str] = None,
+    encoding: str = 'utf-8',
+    source: Optional[str] = None,
+) -> Optional[Path]:
+    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
+    readable_title = title if title else f"{url} - {timestamp}"
+
+    content = await html_to_markdown(url, source)
+    if content is None:
+        raise HTTPException(status_code=400, detail="Failed to convert content to markdown")
+
+    markdown_path, relative_path = assemble_archive_path(readable_title, ".md")
+
+    markdown_content = f"---\n"
+    markdown_content += f"title: {readable_title}\n"
+    markdown_content += f"added: {timestamp}\n"
+    markdown_content += f"url: {url}\n"
+    markdown_content += f"date: {dt_datetime.now().strftime('%Y-%m-%d')}\n"
+    markdown_content += f"---\n\n"
+    markdown_content += f"# {readable_title}\n\n"
+    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}\n\n"
+    markdown_content += content
+
+    try:
+        markdown_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(markdown_path, 'w', encoding=encoding) as md_file:
+            md_file.write(markdown_content)
+        L.DEBUG(f"Successfully saved to {markdown_path}")
+        return markdown_path
+    except Exception as e:
+        L.WARN(f"Failed to write markdown file: {str(e)}")
+        return None
+
+def download_file(url, folder):
+    os.makedirs(folder, exist_ok=True)
+    filename = str(uuid.uuid4()) + os.path.splitext(urlparse(url).path)[-1]
+    filepath = os.path.join(folder, filename)
+
+    session = requests.Session()
+    retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
+    session.mount('http://', HTTPAdapter(max_retries=retries))
+    session.mount('https://', HTTPAdapter(max_retries=retries))
+
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+    }
+
+    try:
+        response = session.get(url, headers=headers, timeout=10)
+        if response.status_code == 200:
+            if 'image' in response.headers.get('Content-Type', ''):
+                with open(filepath, 'wb') as f:
+                    f.write(response.content)
+            else:
+                L.ERR(f"Failed to download image: {url}, invalid content type: {response.headers.get('Content-Type')}")
+                return None
+        else:
+            L.ERR(f"Failed to download image: {url}, status code: {response.status_code}")
+            return None
+    except Exception as e:
+        L.ERR(f"Failed to download image: {url}, error: {str(e)}")
+        return None
+    return filename
+
+def copy_file(local_path, folder):
+    os.makedirs(folder, exist_ok=True)
+    filename = os.path.basename(local_path)
+    destination_path = os.path.join(folder, filename)
+    shutil.copy(local_path, destination_path)
+    return filename
+
+
+async def save_file(file: UploadFile, folder: Path) -> Path:
+    file_path = folder / f"{dt_datetime.now().strftime('%Y%m%d_%H%M%S')}_{file.filename}"
+    with open(file_path, 'wb') as f:
+        shutil.copyfileobj(file.file, f)
+    return file_path
diff --git a/sijapi/routers/note.py b/sijapi/routers/note.py
index 4b359e4..887c5a8 100644
--- a/sijapi/routers/note.py
+++ b/sijapi/routers/note.py
@@ -3,26 +3,9 @@ Manages an Obsidian vault, in particular daily notes, using information and func
 '''
 from fastapi import APIRouter, BackgroundTasks, File, UploadFile, Form, HTTPException, Response, Query, Path as FastAPIPath
 from fastapi.responses import JSONResponse, PlainTextResponse
-from io import BytesIO
-from pydantic import BaseModel
 import os, re
-import uuid
-import aiohttp
 import traceback
-import requests
-import mimetypes
-import shutil
-from zoneinfo import ZoneInfo
-from bs4 import BeautifulSoup
-from markdownify import markdownify as md
 from typing import Optional, Union, Dict, List, Tuple
-from urllib.parse import urlparse
-from urllib3.util.retry import Retry
-import newspaper
-from newspaper import Article
-import trafilatura
-from readability import Document
-from requests.adapters import HTTPAdapter
 import re
 import os
 from datetime import timedelta, datetime as dt_datetime, time as dt_time, date as dt_date
@@ -35,6 +18,7 @@ from sijapi.routers import cal, loc, tts, llm, time, sd, weather, asr
 from sijapi.utilities import assemble_journal_path, assemble_archive_path, convert_to_12_hour_format, sanitize_filename, convert_degrees_to_cardinal, check_file_name, HOURLY_COLUMNS_MAPPING
 from sijapi.classes import Location
 
+
 note = APIRouter()
 
 def list_and_correct_impermissible_files(root_dir, rename: bool = False):
@@ -827,535 +811,3 @@ async def update_daily_note_events(date_time: dt_datetime):
-
-
-### CLIPPER ###
-@note.post("/clip")
-async def clip_post(
-    bg_tasks: BackgroundTasks,
-    url: Optional[str] = Form(None),
-    source: Optional[str] = Form(None),
-    title: Optional[str] = Form(None),
-    tts: str = Form('summary'),
-    voice: str = Form(DEFAULT_VOICE),
-    encoding: str = Form('utf-8')
-):
-    markdown_filename = await process_article(bg_tasks, url, title, encoding, source, tts, voice)
-    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}
-
-@note.post("/archive")
-async def archive_post(
-    url: Optional[str] = Form(None),
-    source: Optional[str] = Form(None),
-    title: Optional[str] = Form(None),
-    encoding: str = Form('utf-8')
-):
-    markdown_filename = await process_archive(url, title, encoding, source)
-    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}
-
-@note.get("/clip")
-async def clip_get(
-    bg_tasks: BackgroundTasks,
-    url: str,
-    title: Optional[str] = Query(None),
-    encoding: str = Query('utf-8'),
-    tts: str = Query('summary'),
-    voice: str = Query(DEFAULT_VOICE)
-):
-    markdown_filename = await process_article(bg_tasks, url, title, encoding, tts=tts, voice=voice)
-    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}
-
-@note.post("/note/add")
-async def note_add_endpoint(file: Optional[UploadFile] = File(None), text: Optional[str] = Form(None), source: Optional[str] = Form(None), bg_tasks: BackgroundTasks = None):
-    L.DEBUG(f"Received request on /note/add...")
-    if not file and not text:
-        L.WARN(f"... without any file or text!")
-        raise HTTPException(status_code=400, detail="Either text or a file must be provided")
-    else:
-        result = await process_for_daily_note(file, text, source, bg_tasks)
-        L.INFO(f"Result on /note/add: {result}")
-        return JSONResponse(result, status_code=204)
-
-async def process_for_daily_note(file: Optional[UploadFile] = File(None), text: Optional[str] = None, source: Optional[str] = None, bg_tasks: BackgroundTasks = None):
-    now = dt_datetime.now()
-    transcription_entry = ""
-    file_entry = ""
-    if file:
-        L.DEBUG("File received...")
-        file_content = await file.read()
-        audio_io = BytesIO(file_content)
-
-        # Improve error handling for file type guessing
-        guessed_type = mimetypes.guess_type(file.filename)
-        file_type = guessed_type[0] if guessed_type[0] else "application/octet-stream"
-
-        L.DEBUG(f"Processing as {file_type}...")
-
-        # Extract the main type (e.g., 'audio', 'image', 'video')
-        main_type = file_type.split('/')[0]
-        subdir = main_type.title() if main_type else "Documents"
-
-        absolute_path, relative_path = assemble_journal_path(now, subdir=subdir, filename=file.filename)
-        L.DEBUG(f"Destination path: {absolute_path}")
-
-        with open(absolute_path, 'wb') as f:
-            f.write(file_content)
-        L.DEBUG(f"Processing {f.name}...")
-
-        if main_type == 'audio':
-            transcription = await asr.transcribe_audio(file_path=absolute_path, params=asr.TranscribeParams(model="small-en", language="en", threads=6))
-            file_entry = f"![[{relative_path}]]"
-        elif main_type == 'image':
-            file_entry = f"![[{relative_path}]]"
-        else:
-            file_entry = f"[Source]({relative_path})"
-
-    text_entry = text if text else ""
-    L.DEBUG(f"transcription: {transcription_entry}\nfile_entry: {file_entry}\ntext_entry: {text_entry}")
-    return await add_to_daily_note(transcription_entry, file_entry, text_entry, now)
-
-async def add_to_daily_note(transcription: str = None, file_link: str = None, additional_text: str = None, date_time: dt_datetime = None):
-    date_time = date_time or dt_datetime.now()
-    note_path, _ = assemble_journal_path(date_time, filename='Notes', extension=".md", no_timestamp = True)
-    time_str = date_time.strftime("%H:%M")
-
-    entry_lines = []
-    if additional_text and additional_text.strip():
-        entry_lines.append(f"\t* {additional_text.strip()}")
-    if transcription and transcription.strip():
-        entry_lines.append(f"\t* {transcription.strip()}")
-    if file_link and file_link.strip():
-        entry_lines.append(f"\t\t {file_link.strip()}")
-
-    entry = f"\n* **{time_str}**\n" + "\n".join(entry_lines)
-
-    # Write the entry to the end of the file
-    if note_path.exists():
-        with open(note_path, 'a', encoding='utf-8') as note_file:
-            note_file.write(entry)
-    else:
-        date_str = date_time.strftime("%Y-%m-%d")
-        frontmatter = f"""---
-date: {date_str}
-tags:
- - notes
----
-
-"""
-        content = frontmatter + entry
-        # If the file doesn't exist, create it and start with "Notes"
-        with open(note_path, 'w', encoding='utf-8') as note_file:
-            note_file.write(content)
-
-    return entry
-
-
-
-async def process_document(
-    bg_tasks: BackgroundTasks,
-    document: File,
-    title: Optional[str] = None,
-    tts_mode: str = "summary",
-    voice: str = DEFAULT_VOICE
-):
-    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
-
-    # Save the document to OBSIDIAN_RESOURCES_DIR
-    document_content = await document.read()
-    file_path = Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR / document.filename
-    with open(file_path, 'wb') as f:
-        f.write(document_content)
-
-    parsed_content = await llm.extract_text(file_path)  # Ensure extract_text is awaited
-    llm_title, summary = await llm.title_and_summary(parsed_content)
-    try:
-        readable_title = sanitize_filename(title if title else document.filename)
-
-        if tts_mode == "full" or tts_mode == "content" or tts_mode == "body":
-            tts_text = parsed_content
-        elif tts_mode == "summary" or tts_mode == "excerpt":
-            tts_text = summary
-        else:
-            tts_text = None
-
-        frontmatter = f"""---
-title: {readable_title}
-added: {timestamp}
----
-"""
-        body = f"# {readable_title}\n\n"
-
-        if tts_text:
-            try:
-                datetime_str = dt_datetime.now().strftime("%Y%m%d%H%M%S")
-                audio_filename = f"{datetime_str} {readable_title}"
-                audio_path = await tts.generate_speech(
-                    bg_tasks=bg_tasks,
-                    text=tts_text,
-                    voice=voice,
-                    model="eleven_turbo_v2",
-                    podcast=True,
-                    title=audio_filename,
-                    output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR
-                )
-                audio_ext = Path(audio_path).suffix
-                obsidian_link = f"![[{OBSIDIAN_RESOURCES_DIR}/{audio_filename}{audio_ext}]]"
-                body += f"{obsidian_link}\n\n"
-            except Exception as e:
-                L.ERR(f"Failed in the TTS portion of clipping: {e}")
-
-        body += f"> [!summary]+\n"
-        body += f"> {summary}\n\n"
-        body += parsed_content
-        markdown_content = frontmatter + body
-
-        markdown_filename = f"{readable_title}.md"
-        encoding = 'utf-8'
-
-        with open(markdown_filename, 'w', encoding=encoding) as md_file:
-            md_file.write(markdown_content)
-
-        L.INFO(f"Successfully saved to {markdown_filename}")
-
-        return markdown_filename
-
-    except Exception as e:
-        L.ERR(f"Failed to clip: {str(e)}")
-        raise HTTPException(status_code=500, detail=str(e))
-
-
-
-async def process_article(
-    bg_tasks: BackgroundTasks,
-    parsed_content: Article,
-    tts_mode: str = "summary",
-    voice: str = DEFAULT_11L_VOICE
-):
-    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
-
-    readable_title = sanitize_filename(parsed_content.title or timestamp)
-    markdown_filename, relative_path = assemble_journal_path(dt_datetime.now(), subdir="Articles", filename=readable_title, extension=".md")
-
-    try:
-        summary = await llm.summarize_text(parsed_content.clean_doc, "Summarize the provided text. Respond with the summary and nothing else. Do not otherwise acknowledge the request. Just provide the requested summary.")
-        summary = summary.replace('\n', ' ')  # Remove line breaks
-
-        if tts_mode == "full" or tts_mode == "content":
-            tts_text = parsed_content.clean_doc
-        elif tts_mode == "summary" or tts_mode == "excerpt":
-            tts_text = summary
-        else:
-            tts_text = None
-
-        banner_markdown = ''
-        try:
-            banner_url = parsed_content.top_image
-            if banner_url != '':
-                banner_image = download_file(banner_url, Path(OBSIDIAN_VAULT_DIR / OBSIDIAN_RESOURCES_DIR))
-                if banner_image:
-                    banner_markdown = f"![[{OBSIDIAN_RESOURCES_DIR}/{banner_image}]]"
-
-        except Exception as e:
-            L.ERR(f"No image found in article")
-
-        authors = ', '.join('[[{}]]'.format(author) for author in parsed_content.authors)
-        published_date = parsed_content.publish_date
-        frontmatter = f"""---
-title: {readable_title}
-authors: {authors}
-published: {published_date}
-added: {timestamp}
-banner: "{banner_markdown}"
-tags:
-
-"""
-        frontmatter += '\n'.join(f" - {tag}" for tag in parsed_content.tags)
-        frontmatter += '\n---\n'
-
-        body = f"# {readable_title}\n\n"
-        if tts_text:
-            audio_filename = f"{published_date} {readable_title}"
-            try:
-                audio_path = await tts.generate_speech(bg_tasks=bg_tasks, text=tts_text, voice=voice, model="eleven_turbo_v2", podcast=True, title=audio_filename,
-                                                       output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR)
-                audio_ext = Path(audio_path).suffix
-                obsidian_link = f"![[{OBSIDIAN_RESOURCES_DIR}/{audio_filename}{audio_ext}]]"
-                body += f"{obsidian_link}\n\n"
-            except Exception as e:
-                L.ERR(f"Failed to generate TTS for np3k. {e}")
-
-        try:
-            body += f"by {authors} in {parsed_content.canonical_link}"  # update with method for getting the newspaper name
-            body += f"> [!summary]+\n"
-            body += f"> {summary}\n\n"
-            body += parsed_content["content"]
-            markdown_content = frontmatter + body
-
-        except Exception as e:
-            L.ERR(f"Failed to combine elements of article markdown.")
-
-        try:
-            with open(markdown_filename, 'w') as md_file:
-                md_file.write(markdown_content)
-
-            L.INFO(f"Successfully saved to {markdown_filename}")
-            add_to_daily_note
-            return markdown_filename
-
-        except Exception as e:
-            L.ERR(f"Failed to write markdown file")
-            raise HTTPException(status_code=500, detail=str(e))
-
-    except Exception as e:
-        L.ERR(f"Failed to clip: {str(e)}")
-        raise HTTPException(status_code=500, detail=str(e))
-
-
-async def process_article2(
-    bg_tasks: BackgroundTasks,
-    url: str,
-    title: Optional[str] = None,
-    encoding: str = 'utf-8',
-    source: Optional[str] = None,
-    tts_mode: str = "summary",
-    voice: str = DEFAULT_11L_VOICE
-):
-
-    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
-
-    parsed_content = await parse_article(url, source)
-    if parsed_content is None:
-        return {"error": "Failed to retrieve content"}
-
-    readable_title = sanitize_filename(title or parsed_content.get("title") or timestamp)
-    markdown_filename, relative_path = assemble_journal_path(dt_datetime.now(), subdir="Articles", filename=readable_title, extension=".md")
-
-    try:
-        summary = await llm.summarize_text(parsed_content["content"], "Summarize the provided text. Respond with the summary and nothing else. Do not otherwise acknowledge the request. Just provide the requested summary.")
-        summary = summary.replace('\n', ' ')  # Remove line breaks
-
-        if tts_mode == "full" or tts_mode == "content":
-            tts_text = parsed_content["content"]
-        elif tts_mode == "summary" or tts_mode == "excerpt":
-            tts_text = summary
-        else:
-            tts_text = None
-
-        banner_markdown = ''
-        try:
-            banner_url = parsed_content.get('image', '')
-            if banner_url != '':
-                banner_image = download_file(banner_url, Path(OBSIDIAN_VAULT_DIR / OBSIDIAN_RESOURCES_DIR))
-                if banner_image:
-                    banner_markdown = f"![[{OBSIDIAN_RESOURCES_DIR}/{banner_image}]]"
-
-        except Exception as e:
-            L.ERR(f"No image found in article")
-
-        authors = ', '.join('[[{}]]'.format(author) for author in parsed_content.get('authors', ['Unknown']))
-
-        frontmatter = f"""---
-title: {readable_title}
-authors: {', '.join('[[{}]]'.format(author) for author in parsed_content.get('authors', ['Unknown']))}
-published: {parsed_content.get('date_published', 'Unknown')}
-added: {timestamp}
-excerpt: {parsed_content.get('excerpt', '')}
-banner: "{banner_markdown}"
-tags:
-
-"""
-        frontmatter += '\n'.join(f" - {tag}" for tag in parsed_content.get('tags', []))
-        frontmatter += '\n---\n'
-
-        body = f"# {readable_title}\n\n"
-
-        if tts_text:
-            datetime_str = dt_datetime.now().strftime("%Y%m%d%H%M%S")
-            audio_filename = f"{datetime_str} {readable_title}"
-            try:
-                audio_path = await tts.generate_speech(bg_tasks=bg_tasks, text=tts_text, voice=voice, model="eleven_turbo_v2", podcast=True, title=audio_filename,
-                                                       output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR)
-                audio_ext = Path(audio_path).suffix
-                obsidian_link = f"![[{OBSIDIAN_RESOURCES_DIR}/{audio_filename}{audio_ext}]]"
-                body += f"{obsidian_link}\n\n"
-            except Exception as e:
-                L.ERR(f"Failed to generate TTS for np3k. {e}")
-
-        try:
-            body += f"by {authors} in [{parsed_content.get('domain', urlparse(url).netloc.replace('www.', ''))}]({url}).\n\n"
-            body += f"> [!summary]+\n"
-            body += f"> {summary}\n\n"
-            body += parsed_content["content"]
-            markdown_content = frontmatter + body
-
-        except Exception as e:
-            L.ERR(f"Failed to combine elements of article markdown.")
-
-        try:
-            with open(markdown_filename, 'w', encoding=encoding) as md_file:
-                md_file.write(markdown_content)
-
-            L.INFO(f"Successfully saved to {markdown_filename}")
-            add_to_daily_note
-            return markdown_filename
-
-        except Exception as e:
-            L.ERR(f"Failed to write markdown file")
-            raise HTTPException(status_code=500, detail=str(e))
-
-    except Exception as e:
-        L.ERR(f"Failed to clip {url}: {str(e)}")
-        raise HTTPException(status_code=500, detail=str(e))
-
-
-
-async def parse_article(url: str, source: Optional[str] = None) -> Article:
-    source = source if source else trafilatura.fetch_url(url)
-    traf = trafilatura.extract_metadata(filecontent=source, default_url=url)
-
-    # Create and parse the newspaper3k Article
-    article = Article(url)
-    article.set_html(source)
-    article.parse()
-
-    L.INFO(f"Parsed {article.title}")
-
-    # Update or set properties based on trafilatura and additional processing
-    article.title = article.title or traf.title or url
-    article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])
-
-    article.publish_date = article.publish_date or traf.date
-    try:
-        article.publish_date = await loc.dt(article.publish_date, "UTC")
-    except:
-        L.DEBUG(f"Failed to localize {article.publish_date}")
-        article.publish_date = await loc.dt(dt_datetime.now(), "UTC")
-
-    article.meta_description = article.meta_description or traf.description
-    article.text = trafilatura.extract(source, output_format="markdown", include_comments=False) or article.text
-    article.top_image = article.top_image or traf.image
-    article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title()
-    article.meta_keywords = article.meta_keywords or traf.categories or traf.tags
-    article.meta_keywords = article.meta_keywords if isinstance(article.meta_keywords, list) else [article.meta_keywords]
-
-    # Set additional data in the additional_data dictionary
-    article.additional_data = {
-        'excerpt': article.meta_description,
-        'domain': article.source_url,
-        'tags': article.meta_keywords,
-        'content': article.text  # Store the markdown content here
-    }
-
-    return article
-
-
-
-async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
-    if source:
-        html_content = source
-    elif url:
-        async with aiohttp.ClientSession() as session:
-            async with session.get(url) as response:
-                html_content = await response.text()
-    else:
-        L.ERR(f"Unable to convert nothing to markdown.")
-        return None
-
-    # Use readability to extract the main content
-    doc = Document(html_content)
-    cleaned_html = doc.summary()
-
-    # Parse the cleaned HTML with BeautifulSoup for any additional processing
-    soup = BeautifulSoup(cleaned_html, 'html.parser')
-
-    # Remove any remaining unwanted elements
-    for element in soup(['script', 'style']):
-        element.decompose()
-
-    # Convert to markdown
-    markdown_content = md(str(soup), heading_style="ATX")
-
-    return markdown_content
-
-
-async def process_archive(
-    url: str,
-    title: Optional[str] = None,
-    encoding: str = 'utf-8',
-    source: Optional[str] = None,
-) -> Path:
-    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
-    readable_title = title if title else f"{url} - {timestamp}"
-
-    content = await html_to_markdown(url, source)
-    if content is None:
-        raise HTTPException(status_code=400, detail="Failed to convert content to markdown")
-
-    markdown_path, relative_path = assemble_archive_path(readable_title, ".md")
-
-    markdown_content = f"---\n"
-    markdown_content += f"title: {readable_title}\n"
-    markdown_content += f"added: {timestamp}\n"
-    markdown_content += f"url: {url}"
-    markdown_content += f"date: {dt_datetime.now().strftime('%Y-%m-%d')}"
-    markdown_content += f"---\n\n"
-    markdown_content += f"# {readable_title}\n\n"
-    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}"
-    markdown_content += content
-
-    try:
-        markdown_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(markdown_path, 'w', encoding=encoding) as md_file:
-            md_file.write(markdown_content)
-        L.DEBUG(f"Successfully saved to {markdown_path}")
-        return markdown_path
-    except Exception as e:
-        L.WARN(f"Failed to write markdown file: {str(e)}")
-        return None
-
-def download_file(url, folder):
-    os.makedirs(folder, exist_ok=True)
-    filename = str(uuid.uuid4()) + os.path.splitext(urlparse(url).path)[-1]
-    filepath = os.path.join(folder, filename)
-
-    session = requests.Session()
-    retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
-    session.mount('http://', HTTPAdapter(max_retries=retries))
-    session.mount('https://', HTTPAdapter(max_retries=retries))
-
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
-    }
-
-    try:
-        response = session.get(url, headers=headers, timeout=10)
-        if response.status_code == 200:
-            if 'image' in response.headers.get('Content-Type', ''):
-                with open(filepath, 'wb') as f:
-                    f.write(response.content)
-            else:
-                L.ERR(f"Failed to download image: {url}, invalid content type: {response.headers.get('Content-Type')}")
-                return None
-        else:
-            L.ERR(f"Failed to download image: {url}, status code: {response.status_code}")
-            return None
-    except Exception as e:
-        L.ERR(f"Failed to download image: {url}, error: {str(e)}")
-        return None
-    return filename
-
-def copy_file(local_path, folder):
-    os.makedirs(folder, exist_ok=True)
-    filename = os.path.basename(local_path)
-    destination_path = os.path.join(folder, filename)
-    shutil.copy(local_path, destination_path)
-    return filename
-
-
-async def save_file(file: UploadFile, folder: Path) -> Path:
-    file_path = folder / f"{dt_datetime.now().strftime('%Y%m%d_%H%M%S')}_{file.filename}"
-    with open(file_path, 'wb') as f:
-        shutil.copyfileobj(file.file, f)
-    return file_path
-