Auto-update: Sat Jun 29 17:18:50 PDT 2024
parent 565a576c48
commit ad0ae30575
3 changed files with 560 additions and 549 deletions
@@ -24,6 +24,7 @@ MODULES:
 ig: off
 llm: on
 loc: on
+news: on
 note: on
 rag: off
 sd: on
sijapi/routers/news.py (new file, 558 additions)
@@ -0,0 +1,558 @@
from fastapi import APIRouter, BackgroundTasks, File, UploadFile, Form, HTTPException, Response, Query, Path as FastAPIPath
from fastapi.responses import JSONResponse
from zoneinfo import ZoneInfo
from io import BytesIO
from pydantic import BaseModel
from bs4 import BeautifulSoup
import requests
from markdownify import markdownify as md
import os
import mimetypes
from datetime import datetime as dt_datetime
import shutil
import uuid
import aiohttp
from pathlib import Path
from urllib.parse import urlparse
from urllib3.util.retry import Retry
from typing import Optional
import newspaper
from newspaper import Article
import trafilatura
from readability import Document
from requests.adapters import HTTPAdapter
from sijapi import API, L, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, GEO
from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path
from sijapi.routers import llm, tts, asr, loc

news = APIRouter()


### CLIPPER ###
@news.post("/clip")
async def clip_post(
    bg_tasks: BackgroundTasks,
    url: Optional[str] = Form(None),
    source: Optional[str] = Form(None),
    title: Optional[str] = Form(None),
    tts: str = Form('summary'),
    voice: str = Form(DEFAULT_VOICE),
    encoding: str = Form('utf-8')
):
    markdown_filename = await process_article(bg_tasks, url, title, encoding, source, tts, voice)
    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}


@news.post("/archive")
async def archive_post(
    url: Optional[str] = Form(None),
    source: Optional[str] = Form(None),
    title: Optional[str] = Form(None),
    encoding: str = Form('utf-8')
):
    markdown_filename = await process_archive(url, title, encoding, source)
    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}


@news.get("/clip")
async def clip_get(
    bg_tasks: BackgroundTasks,
    url: str,
    title: Optional[str] = Query(None),
    encoding: str = Query('utf-8'),
    tts: str = Query('summary'),
    voice: str = Query(DEFAULT_VOICE)
):
    markdown_filename = await process_article(bg_tasks, url, title, encoding, tts=tts, voice=voice)
    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}

@news.post("/note/add")
|
||||||
|
async def note_add_endpoint(file: Optional[UploadFile] = File(None), text: Optional[str] = Form(None), source: Optional[str] = Form(None), bg_tasks: BackgroundTasks = None):
|
||||||
|
L.DEBUG(f"Received request on /note/add...")
|
||||||
|
if not file and not text:
|
||||||
|
L.WARN(f"... without any file or text!")
|
||||||
|
raise HTTPException(status_code=400, detail="Either text or a file must be provided")
|
||||||
|
else:
|
||||||
|
result = await process_for_daily_note(file, text, source, bg_tasks)
|
||||||
|
L.INFO(f"Result on /note/add: {result}")
|
||||||
|
return JSONResponse(result, status_code=204)
|
||||||
|
|
||||||
|
async def process_for_daily_note(file: Optional[UploadFile] = File(None), text: Optional[str] = None, source: Optional[str] = None, bg_tasks: BackgroundTasks = None):
|
||||||
|
now = dt_datetime.now()
|
||||||
|
transcription_entry = ""
|
||||||
|
file_entry = ""
|
||||||
|
if file:
|
||||||
|
L.DEBUG("File received...")
|
||||||
|
file_content = await file.read()
|
||||||
|
audio_io = BytesIO(file_content)
|
||||||
|
|
||||||
|
# Improve error handling for file type guessing
|
||||||
|
guessed_type = mimetypes.guess_type(file.filename)
|
||||||
|
file_type = guessed_type[0] if guessed_type[0] else "application/octet-stream"
|
||||||
|
|
||||||
|
L.DEBUG(f"Processing as {file_type}...")
|
||||||
|
|
||||||
|
# Extract the main type (e.g., 'audio', 'image', 'video')
|
||||||
|
main_type = file_type.split('/')[0]
|
||||||
|
subdir = main_type.title() if main_type else "Documents"
|
||||||
|
|
||||||
|
absolute_path, relative_path = assemble_journal_path(now, subdir=subdir, filename=file.filename)
|
||||||
|
L.DEBUG(f"Destination path: {absolute_path}")
|
||||||
|
|
||||||
|
with open(absolute_path, 'wb') as f:
|
||||||
|
f.write(file_content)
|
||||||
|
L.DEBUG(f"Processing {f.name}...")
|
||||||
|
|
||||||
|
if main_type == 'audio':
|
||||||
|
transcription = await asr.transcribe_audio(file_path=absolute_path, params=asr.TranscribeParams(model="small-en", language="en", threads=6))
|
||||||
|
file_entry = f"![[{relative_path}]]"
|
||||||
|
elif main_type == 'image':
|
||||||
|
file_entry = f"![[{relative_path}]]"
|
||||||
|
else:
|
||||||
|
file_entry = f"[Source]({relative_path})"
|
||||||
|
|
||||||
|
text_entry = text if text else ""
|
||||||
|
L.DEBUG(f"transcription: {transcription_entry}\nfile_entry: {file_entry}\ntext_entry: {text_entry}")
|
||||||
|
return await add_to_daily_note(transcription_entry, file_entry, text_entry, now)
|
||||||
|
|
||||||
|
async def add_to_daily_note(transcription: str = None, file_link: str = None, additional_text: str = None, date_time: dt_datetime = None):
|
||||||
|
date_time = date_time or dt_datetime.now()
|
||||||
|
note_path, _ = assemble_journal_path(date_time, filename='Notes', extension=".md", no_timestamp = True)
|
||||||
|
time_str = date_time.strftime("%H:%M")
|
||||||
|
|
||||||
|
entry_lines = []
|
||||||
|
if additional_text and additional_text.strip():
|
||||||
|
entry_lines.append(f"\t* {additional_text.strip()}")
|
||||||
|
if transcription and transcription.strip():
|
||||||
|
entry_lines.append(f"\t* {transcription.strip()}")
|
||||||
|
if file_link and file_link.strip():
|
||||||
|
entry_lines.append(f"\t\t {file_link.strip()}")
|
||||||
|
|
||||||
|
entry = f"\n* **{time_str}**\n" + "\n".join(entry_lines)
|
||||||
|
|
||||||
|
# Write the entry to the end of the file
|
||||||
|
if note_path.exists():
|
||||||
|
with open(note_path, 'a', encoding='utf-8') as note_file:
|
||||||
|
note_file.write(entry)
|
||||||
|
else:
|
||||||
|
date_str = date_time.strftime("%Y-%m-%d")
|
||||||
|
frontmatter = f"""---
|
||||||
|
date: {date_str}
|
||||||
|
tags:
|
||||||
|
- notes
|
||||||
|
---
|
||||||
|
|
||||||
|
"""
|
||||||
|
content = frontmatter + entry
|
||||||
|
# If the file doesn't exist, create it and start with "Notes"
|
||||||
|
with open(note_path, 'w', encoding='utf-8') as note_file:
|
||||||
|
note_file.write(content)
|
||||||
|
|
||||||
|
return entry
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
async def process_document(
    bg_tasks: BackgroundTasks,
    document: File,
    title: Optional[str] = None,
    tts_mode: str = "summary",
    voice: str = DEFAULT_VOICE
):
    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')

    # Save the document to OBSIDIAN_RESOURCES_DIR
    document_content = await document.read()
    file_path = Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR / document.filename
    with open(file_path, 'wb') as f:
        f.write(document_content)

    parsed_content = await llm.extract_text(file_path)  # Ensure extract_text is awaited

    llm_title, summary = await llm.title_and_summary(parsed_content)
    try:
        readable_title = sanitize_filename(title if title else document.filename)

        if tts_mode == "full" or tts_mode == "content" or tts_mode == "body":
            tts_text = parsed_content
        elif tts_mode == "summary" or tts_mode == "excerpt":
            tts_text = summary
        else:
            tts_text = None

        frontmatter = f"""---
title: {readable_title}
added: {timestamp}
---
"""
        body = f"# {readable_title}\n\n"

        if tts_text:
            try:
                datetime_str = dt_datetime.now().strftime("%Y%m%d%H%M%S")
                audio_filename = f"{datetime_str} {readable_title}"
                audio_path = await tts.generate_speech(
                    bg_tasks=bg_tasks,
                    text=tts_text,
                    voice=voice,
                    model="eleven_turbo_v2",
                    podcast=True,
                    title=audio_filename,
                    output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR
                )
                audio_ext = Path(audio_path).suffix
                obsidian_link = f"![[{OBSIDIAN_RESOURCES_DIR}/{audio_filename}{audio_ext}]]"
                body += f"{obsidian_link}\n\n"
            except Exception as e:
                L.ERR(f"Failed in the TTS portion of clipping: {e}")

        body += f"> [!summary]+\n"
        body += f"> {summary}\n\n"
        body += parsed_content
        markdown_content = frontmatter + body

        markdown_filename = f"{readable_title}.md"
        encoding = 'utf-8'

        with open(markdown_filename, 'w', encoding=encoding) as md_file:
            md_file.write(markdown_content)

        L.INFO(f"Successfully saved to {markdown_filename}")

        return markdown_filename

    except Exception as e:
        L.ERR(f"Failed to clip: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

async def process_article(
    bg_tasks: BackgroundTasks,
    parsed_content: Article,
    tts_mode: str = "summary",
    voice: str = DEFAULT_11L_VOICE
):
    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')

    readable_title = sanitize_filename(parsed_content.title or timestamp)
    markdown_filename, relative_path = assemble_journal_path(dt_datetime.now(), subdir="Articles", filename=readable_title, extension=".md")

    try:
        summary = await llm.summarize_text(parsed_content.clean_doc, "Summarize the provided text. Respond with the summary and nothing else. Do not otherwise acknowledge the request. Just provide the requested summary.")
        summary = summary.replace('\n', ' ')  # Remove line breaks

        if tts_mode == "full" or tts_mode == "content":
            tts_text = parsed_content.clean_doc
        elif tts_mode == "summary" or tts_mode == "excerpt":
            tts_text = summary
        else:
            tts_text = None

        banner_markdown = ''
        try:
            banner_url = parsed_content.top_image
            if banner_url != '':
                banner_image = download_file(banner_url, Path(OBSIDIAN_VAULT_DIR / OBSIDIAN_RESOURCES_DIR))
                if banner_image:
                    banner_markdown = f"![[{OBSIDIAN_RESOURCES_DIR}/{banner_image}]]"

        except Exception as e:
            L.ERR(f"No image found in article")

        authors = ', '.join('[[{}]]'.format(author) for author in parsed_content.authors)
        published_date = parsed_content.publish_date
        frontmatter = f"""---
title: {readable_title}
authors: {authors}
published: {published_date}
added: {timestamp}
banner: "{banner_markdown}"
tags:

"""
        frontmatter += '\n'.join(f" - {tag}" for tag in parsed_content.tags)
        frontmatter += '\n---\n'

        body = f"# {readable_title}\n\n"
        if tts_text:
            audio_filename = f"{published_date} {readable_title}"
            try:
                audio_path = await tts.generate_speech(bg_tasks=bg_tasks, text=tts_text, voice=voice, model="eleven_turbo_v2", podcast=True, title=audio_filename,
                                                       output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR)
                audio_ext = Path(audio_path).suffix
                obsidian_link = f"![[{OBSIDIAN_RESOURCES_DIR}/{audio_filename}{audio_ext}]]"
                body += f"{obsidian_link}\n\n"
            except Exception as e:
                L.ERR(f"Failed to generate TTS for np3k. {e}")

        try:
            body += f"by {authors} in {parsed_content.canonical_link}"  # update with method for getting the newspaper name
            body += f"> [!summary]+\n"
            body += f"> {summary}\n\n"
            body += parsed_content["content"]
            markdown_content = frontmatter + body

        except Exception as e:
            L.ERR(f"Failed to combine elements of article markdown.")

        try:
            with open(markdown_filename, 'w') as md_file:
                md_file.write(markdown_content)

            L.INFO(f"Successfully saved to {markdown_filename}")
            add_to_daily_note
            return markdown_filename

        except Exception as e:
            L.ERR(f"Failed to write markdown file")
            raise HTTPException(status_code=500, detail=str(e))

    except Exception as e:
        L.ERR(f"Failed to clip: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


async def process_article2(
    bg_tasks: BackgroundTasks,
    url: str,
    title: Optional[str] = None,
    encoding: str = 'utf-8',
    source: Optional[str] = None,
    tts_mode: str = "summary",
    voice: str = DEFAULT_11L_VOICE
):

    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')

    parsed_content = await parse_article(url, source)
    if parsed_content is None:
        return {"error": "Failed to retrieve content"}

    readable_title = sanitize_filename(title or parsed_content.get("title") or timestamp)
    markdown_filename, relative_path = assemble_journal_path(dt_datetime.now(), subdir="Articles", filename=readable_title, extension=".md")

    try:
        summary = await llm.summarize_text(parsed_content["content"], "Summarize the provided text. Respond with the summary and nothing else. Do not otherwise acknowledge the request. Just provide the requested summary.")
        summary = summary.replace('\n', ' ')  # Remove line breaks

        if tts_mode == "full" or tts_mode == "content":
            tts_text = parsed_content["content"]
        elif tts_mode == "summary" or tts_mode == "excerpt":
            tts_text = summary
        else:
            tts_text = None

        banner_markdown = ''
        try:
            banner_url = parsed_content.get('image', '')
            if banner_url != '':
                banner_image = download_file(banner_url, Path(OBSIDIAN_VAULT_DIR / OBSIDIAN_RESOURCES_DIR))
                if banner_image:
                    banner_markdown = f"![[{OBSIDIAN_RESOURCES_DIR}/{banner_image}]]"

        except Exception as e:
            L.ERR(f"No image found in article")

        authors = ', '.join('[[{}]]'.format(author) for author in parsed_content.get('authors', ['Unknown']))

        frontmatter = f"""---
title: {readable_title}
authors: {', '.join('[[{}]]'.format(author) for author in parsed_content.get('authors', ['Unknown']))}
published: {parsed_content.get('date_published', 'Unknown')}
added: {timestamp}
excerpt: {parsed_content.get('excerpt', '')}
banner: "{banner_markdown}"
tags:

"""
        frontmatter += '\n'.join(f" - {tag}" for tag in parsed_content.get('tags', []))
        frontmatter += '\n---\n'

        body = f"# {readable_title}\n\n"

        if tts_text:
            datetime_str = dt_datetime.now().strftime("%Y%m%d%H%M%S")
            audio_filename = f"{datetime_str} {readable_title}"
            try:
                audio_path = await tts.generate_speech(bg_tasks=bg_tasks, text=tts_text, voice=voice, model="eleven_turbo_v2", podcast=True, title=audio_filename,
                                                       output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR)
                audio_ext = Path(audio_path).suffix
                obsidian_link = f"![[{OBSIDIAN_RESOURCES_DIR}/{audio_filename}{audio_ext}]]"
                body += f"{obsidian_link}\n\n"
            except Exception as e:
                L.ERR(f"Failed to generate TTS for np3k. {e}")

        try:
            body += f"by {authors} in [{parsed_content.get('domain', urlparse(url).netloc.replace('www.', ''))}]({url}).\n\n"
            body += f"> [!summary]+\n"
            body += f"> {summary}\n\n"
            body += parsed_content["content"]
            markdown_content = frontmatter + body

        except Exception as e:
            L.ERR(f"Failed to combine elements of article markdown.")

        try:
            with open(markdown_filename, 'w', encoding=encoding) as md_file:
                md_file.write(markdown_content)

            L.INFO(f"Successfully saved to {markdown_filename}")
            add_to_daily_note
            return markdown_filename

        except Exception as e:
            L.ERR(f"Failed to write markdown file")
            raise HTTPException(status_code=500, detail=str(e))

    except Exception as e:
        L.ERR(f"Failed to clip {url}: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

async def parse_article(url: str, source: Optional[str] = None) -> Article:
    source = source if source else trafilatura.fetch_url(url)
    traf = trafilatura.extract_metadata(filecontent=source, default_url=url)

    # Create and parse the newspaper3k Article
    article = Article(url)
    article.set_html(source)
    article.parse()

    L.INFO(f"Parsed {article.title}")

    # Update or set properties based on trafilatura and additional processing
    article.title = article.title or traf.title or url
    article.authors = article.authors or (traf.author if isinstance(traf.author, list) else [traf.author])

    article.publish_date = article.publish_date or traf.date
    try:
        article.publish_date = await loc.dt(article.publish_date, "UTC")
    except:
        L.DEBUG(f"Failed to localize {article.publish_date}")
        article.publish_date = await loc.dt(dt_datetime.now(), "UTC")

    article.meta_description = article.meta_description or traf.description
    article.text = trafilatura.extract(source, output_format="markdown", include_comments=False) or article.text
    article.top_image = article.top_image or traf.image
    article.source_url = traf.sitename or urlparse(url).netloc.replace('www.', '').title()
    article.meta_keywords = article.meta_keywords or traf.categories or traf.tags
    article.meta_keywords = article.meta_keywords if isinstance(article.meta_keywords, list) else [article.meta_keywords]

    # Set additional data in the additional_data dictionary
    article.additional_data = {
        'excerpt': article.meta_description,
        'domain': article.source_url,
        'tags': article.meta_keywords,
        'content': article.text  # Store the markdown content here
    }

    return article


async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
    if source:
        html_content = source
    elif url:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                html_content = await response.text()
    else:
        L.ERR(f"Unable to convert nothing to markdown.")
        return None

    # Use readability to extract the main content
    doc = Document(html_content)
    cleaned_html = doc.summary()

    # Parse the cleaned HTML with BeautifulSoup for any additional processing
    soup = BeautifulSoup(cleaned_html, 'html.parser')

    # Remove any remaining unwanted elements
    for element in soup(['script', 'style']):
        element.decompose()

    # Convert to markdown
    markdown_content = md(str(soup), heading_style="ATX")

    return markdown_content


async def process_archive(
    url: str,
    title: Optional[str] = None,
    encoding: str = 'utf-8',
    source: Optional[str] = None,
) -> Path:
    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
    readable_title = title if title else f"{url} - {timestamp}"

    content = await html_to_markdown(url, source)
    if content is None:
        raise HTTPException(status_code=400, detail="Failed to convert content to markdown")

    markdown_path, relative_path = assemble_archive_path(readable_title, ".md")

    markdown_content = f"---\n"
    markdown_content += f"title: {readable_title}\n"
    markdown_content += f"added: {timestamp}\n"
    markdown_content += f"url: {url}"
    markdown_content += f"date: {dt_datetime.now().strftime('%Y-%m-%d')}"
    markdown_content += f"---\n\n"
    markdown_content += f"# {readable_title}\n\n"
    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}"
    markdown_content += content

    try:
        markdown_path.parent.mkdir(parents=True, exist_ok=True)
        with open(markdown_path, 'w', encoding=encoding) as md_file:
            md_file.write(markdown_content)
        L.DEBUG(f"Successfully saved to {markdown_path}")
        return markdown_path
    except Exception as e:
        L.WARN(f"Failed to write markdown file: {str(e)}")
        return None

def download_file(url, folder):
    os.makedirs(folder, exist_ok=True)
    filename = str(uuid.uuid4()) + os.path.splitext(urlparse(url).path)[-1]
    filepath = os.path.join(folder, filename)

    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
    session.mount('http://', HTTPAdapter(max_retries=retries))
    session.mount('https://', HTTPAdapter(max_retries=retries))

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    try:
        response = session.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            if 'image' in response.headers.get('Content-Type', ''):
                with open(filepath, 'wb') as f:
                    f.write(response.content)
            else:
                L.ERR(f"Failed to download image: {url}, invalid content type: {response.headers.get('Content-Type')}")
                return None
        else:
            L.ERR(f"Failed to download image: {url}, status code: {response.status_code}")
            return None
    except Exception as e:
        L.ERR(f"Failed to download image: {url}, error: {str(e)}")
        return None
    return filename


def copy_file(local_path, folder):
    os.makedirs(folder, exist_ok=True)
    filename = os.path.basename(local_path)
    destination_path = os.path.join(folder, filename)
    shutil.copy(local_path, destination_path)
    return filename


async def save_file(file: UploadFile, folder: Path) -> Path:
    file_path = folder / f"{dt_datetime.now().strftime('%Y%m%d_%H%M%S')}_{file.filename}"
    with open(file_path, 'wb') as f:
        shutil.copyfileobj(file.file, f)
    return file_path
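For reference, a minimal sketch of how the new endpoints might be exercised from a client. The paths and form fields (/clip, /archive, /note/add; url, title, tts, voice, text) mirror the router definitions above; the base URL, the unprefixed mount point, and the absence of authentication are assumptions that depend on how the sijapi application mounts this router.

# Hypothetical client sketch; BASE_URL and the unprefixed mount point are assumptions.
import requests

BASE_URL = "http://localhost:4444"  # assumption: a locally running sijapi instance

# POST /clip: clip an article by URL and have a summary read with the default voice
resp = requests.post(f"{BASE_URL}/clip", data={"url": "https://example.com/article", "tts": "summary"})
print(resp.json())  # {"message": "Clip saved successfully", "markdown_filename": ...}

# POST /archive: save a readability-cleaned markdown archive of a page (no TTS)
resp = requests.post(f"{BASE_URL}/archive", data={"url": "https://example.com/article"})
print(resp.json())

# POST /note/add: append free-form text to today's daily note (the endpoint returns a 204 status)
resp = requests.post(f"{BASE_URL}/note/add", data={"text": "A quick thought for today's note"})
print(resp.status_code)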
The remaining hunks modify the note router, removing the imports and clipper code that now live in news.py:

@@ -3,26 +3,9 @@ Manages an Obsidian vault, in particular daily notes, using information and func
 '''
 from fastapi import APIRouter, BackgroundTasks, File, UploadFile, Form, HTTPException, Response, Query, Path as FastAPIPath
 from fastapi.responses import JSONResponse, PlainTextResponse
-from io import BytesIO
-from pydantic import BaseModel
 import os, re
-import uuid
-import aiohttp
 import traceback
-import requests
-import mimetypes
-import shutil
-from zoneinfo import ZoneInfo
-from bs4 import BeautifulSoup
-from markdownify import markdownify as md
 from typing import Optional, Union, Dict, List, Tuple
-from urllib.parse import urlparse
-from urllib3.util.retry import Retry
-import newspaper
-from newspaper import Article
-import trafilatura
-from readability import Document
-from requests.adapters import HTTPAdapter
 import re
 import os
 from datetime import timedelta, datetime as dt_datetime, time as dt_time, date as dt_date
@@ -35,6 +18,7 @@ from sijapi.routers import cal, loc, tts, llm, time, sd, weather, asr
 from sijapi.utilities import assemble_journal_path, assemble_archive_path, convert_to_12_hour_format, sanitize_filename, convert_degrees_to_cardinal, check_file_name, HOURLY_COLUMNS_MAPPING
 from sijapi.classes import Location
 
+
 note = APIRouter()
 
 def list_and_correct_impermissible_files(root_dir, rename: bool = False):
@@ -827,535 +811,3 @@ async def update_daily_note_events(date_time: dt_datetime):
(The lines removed by this hunk are the clipper and archive functions previously defined in the note router: the ### CLIPPER ### section with clip_post, archive_post, clip_get, note_add_endpoint, process_for_daily_note, add_to_daily_note, process_document, process_article, process_article2, parse_article, html_to_markdown, process_archive, download_file, copy_file, and save_file. They are essentially identical to the versions added in sijapi/routers/news.py above, except that the endpoints were registered on the note router rather than on news.)