From 8f095e5e71b87782ade19bbe93b5314e07d703bd Mon Sep 17 00:00:00 2001 From: sanj <67624670+iodrift@users.noreply.github.com> Date: Thu, 27 Jun 2024 09:46:17 -0700 Subject: [PATCH] Auto-update: Thu Jun 27 09:46:17 PDT 2024 --- sijapi/__init__.py | 4 +- sijapi/data/sd/workflows/wallpaper.json | 150 ++++++++++++------------ sijapi/routers/note.py | 115 +++++++++--------- sijapi/utilities.py | 67 +++++++---- 4 files changed, 175 insertions(+), 161 deletions(-) diff --git a/sijapi/__init__.py b/sijapi/__init__.py index 9e1d068..30f0290 100644 --- a/sijapi/__init__.py +++ b/sijapi/__init__.py @@ -72,7 +72,7 @@ DynamicTZ = TimezoneTracker(DB) ### Obsidian & notes ALLOWED_FILENAME_CHARS = r'[^\w \.-]' -MAX_FILENAME_LENGTH = 255 +MAX_PATH_LENGTH = 254 OBSIDIAN_VAULT_DIR = Path(os.getenv("OBSIDIAN_BASE_DIR") or HOME_DIR / "Nextcloud" / "notes") OBSIDIAN_JOURNAL_DIR = OBSIDIAN_VAULT_DIR / "journal" OBSIDIAN_RESOURCES_DIR = "obsidian/resources" @@ -80,6 +80,8 @@ OBSIDIAN_BANNER_DIR = f"{OBSIDIAN_RESOURCES_DIR}/banners" os.makedirs(Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_BANNER_DIR, exist_ok=True) OBSIDIAN_BANNER_SCENE = os.getenv("OBSIDIAN_BANNER_SCENE", "wallpaper") OBSIDIAN_CHROMADB_COLLECTION = os.getenv("OBSIDIAN_CHROMADB_COLLECTION", "obsidian") +ARCHIVE_DIR = Path(os.getenv("ARCHIVE_DIR", OBSIDIAN_VAULT_DIR / "archive")) +os.makedirs(ARCHIVE_DIR, exist_ok=True) DOC_DIR = DATA_DIR / "docs" os.makedirs(DOC_DIR, exist_ok=True) diff --git a/sijapi/data/sd/workflows/wallpaper.json b/sijapi/data/sd/workflows/wallpaper.json index e541bd6..fdf0062 100644 --- a/sijapi/data/sd/workflows/wallpaper.json +++ b/sijapi/data/sd/workflows/wallpaper.json @@ -51,23 +51,23 @@ "inputs": { "batch_size": 1, "width": 1023, - "height": 1025, + "height": 1024, "resampling": "bicubic", "X": 0, "Y": 0, "Z": 0, "evolution": 0.1, "frame": 1, - "scale": 13.1, + "scale": 6.66, "octaves": 8, - "persistence": 6.2, - "lacunarity": 5.38, - "exponent": 4.5600000000000005, - "brightness": -0.16, - "contrast": -0.13, + "persistence": 3, + "lacunarity": 6.66, + "exponent": 1, + "brightness": 0, + "contrast": 0, "clamp_min": 0, "clamp_max": 1, - "seed": 474669046020372, + "seed": 300432080108380, "device": "cpu", "optional_vae": [ "4", @@ -81,10 +81,10 @@ }, "13": { "inputs": { - "seed": 484066073734968, - "steps": 8, + "seed": 1125631171146107, + "steps": 10, "cfg": 1.8, - "sampler_name": "dpmpp_2m_sde", + "sampler_name": "dpmpp_2s_ancestral", "scheduler": "karras", "start_at_step": 0, "end_at_step": 10000, @@ -197,57 +197,6 @@ "title": "CLIP Text Encode (Prompt)" } }, - "22": { - "inputs": { - "upscale_by": 2, - "seed": 589846903558615, - "steps": 20, - "cfg": 1.6, - "sampler_name": "heun", - "scheduler": "sgm_uniform", - "denoise": 0.21, - "mode_type": "Linear", - "tile_width": 512, - "tile_height": 512, - "mask_blur": 8, - "tile_padding": 32, - "seam_fix_mode": "Band Pass", - "seam_fix_denoise": 1, - "seam_fix_width": 64, - "seam_fix_mask_blur": 8, - "seam_fix_padding": 16, - "force_uniform_tiles": true, - "tiled_decode": true, - "image": [ - "38", - 0 - ], - "model": [ - "4", - 0 - ], - "positive": [ - "6", - 0 - ], - "negative": [ - "23", - 0 - ], - "vae": [ - "4", - 2 - ], - "upscale_model": [ - "24", - 0 - ] - }, - "class_type": "UltimateSDUpscale", - "_meta": { - "title": "Ultimate SD Upscale" - } - }, "23": { "inputs": { "conditioning": [ @@ -276,7 +225,7 @@ 0 ], "image": [ - "22", + "39", 0 ] }, @@ -313,21 +262,6 @@ "title": "ImageBlur" } }, - "36": { - "inputs": { - "mode": "bicubic", - "factor": 1.25, - "align": "true", - "samples": [ - "13", - 0 - ] - }, - "class_type": "Latent Upscale by Factor (WAS)", - "_meta": { - "title": "Latent Upscale by Factor (WAS)" - } - }, "38": { "inputs": { "samples": [ @@ -343,5 +277,65 @@ "_meta": { "title": "VAE Decode" } + }, + "39": { + "inputs": { + "upscale_by": 2, + "seed": 687912408861107, + "steps": 20, + "cfg": 1.9000000000000001, + "sampler_name": "heun", + "scheduler": "sgm_uniform", + "denoise": 0.2, + "mode_type": "Linear", + "tile_width": 512, + "tile_height": 512, + "mask_blur": 8, + "tile_padding": 32, + "seam_fix_mode": "Band Pass", + "seam_fix_denoise": 1, + "seam_fix_width": 64, + "seam_fix_mask_blur": 8, + "seam_fix_padding": 16, + "force_uniform_tiles": true, + "tiled_decode": true, + "image": [ + "38", + 0 + ], + "model": [ + "4", + 0 + ], + "positive": [ + "6", + 0 + ], + "negative": [ + "23", + 0 + ], + "vae": [ + "4", + 2 + ], + "upscale_model": [ + "40", + 0 + ] + }, + "class_type": "UltimateSDUpscale", + "_meta": { + "title": "Ultimate SD Upscale" + } + }, + "40": { + "inputs": { + "model_name": "RealESRGAN_x2.pth" + }, + "class_type": "UpscaleModelLoader", + "_meta": { + "title": "Load Upscale Model" + } } } \ No newline at end of file diff --git a/sijapi/routers/note.py b/sijapi/routers/note.py index 2491cb0..d1d0813 100644 --- a/sijapi/routers/note.py +++ b/sijapi/routers/note.py @@ -7,15 +7,19 @@ from io import BytesIO from pydantic import BaseModel import os, re import uuid +import aiohttp import traceback import requests import mimetypes import shutil +from bs4 import BeautifulSoup +from markdownify import markdownify as md from typing import Optional, Union, Dict, List, Tuple from urllib.parse import urlparse from urllib3.util.retry import Retry from newspaper import Article import trafilatura +from readability import Document from requests.adapters import HTTPAdapter import re import os @@ -23,10 +27,10 @@ from datetime import timedelta, datetime, time as dt_time, date as dt_date from fastapi import HTTPException, status from pathlib import Path from fastapi import APIRouter, Query, HTTPException -from sijapi import L, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, BASE_URL, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, TZ +from sijapi import L, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, ARCHIVE_DIR, BASE_URL, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, TZ from sijapi.routers import tts, llm, time, sd, locate, weather, asr, calendar from sijapi.routers.locate import Location -from sijapi.utilities import assemble_journal_path, convert_to_12_hour_format, sanitize_filename, convert_degrees_to_cardinal, HOURLY_COLUMNS_MAPPING +from sijapi.utilities import assemble_journal_path, assemble_archive_path, convert_to_12_hour_format, sanitize_filename, convert_degrees_to_cardinal, HOURLY_COLUMNS_MAPPING note = APIRouter() @@ -440,9 +444,9 @@ async def parse_article(url: str, source: Optional[str] = None): L.INFO(f"Parsed {np3k.title}") - title = np3k.title or traf.title + title = (np3k.title or traf.title) or url authors = np3k.authors or traf.author - authors = authors if isinstance(authors, List) else [authors] + authors = (authors if isinstance(authors, List) else [authors]) date = np3k.publish_date or traf.date try: date = await locate.localize_datetime(date) @@ -455,7 +459,7 @@ async def parse_article(url: str, source: Optional[str] = None): domain = traf.sitename or urlparse(url).netloc.replace('www.', '').title() tags = np3k.meta_keywords or traf.categories or traf.tags tags = tags if isinstance(tags, List) else [tags] - + return { 'title': title.replace(" ", " "), 'authors': authors, @@ -469,6 +473,33 @@ async def parse_article(url: str, source: Optional[str] = None): } +async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]: + if source: + html_content = source + elif url: + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + html_content = await response.text() + else: + L.ERR(f"Unable to convert nothing to markdown.") + return None + + # Use readability to extract the main content + doc = Document(html_content) + cleaned_html = doc.summary() + + # Parse the cleaned HTML with BeautifulSoup for any additional processing + soup = BeautifulSoup(cleaned_html, 'html.parser') + + # Remove any remaining unwanted elements + for element in soup(['script', 'style']): + element.decompose() + + # Convert to markdown + markdown_content = md(str(soup), heading_style="ATX") + + return markdown_content + async def process_archive( background_tasks: BackgroundTasks, @@ -476,59 +507,32 @@ async def process_archive( title: Optional[str] = None, encoding: str = 'utf-8', source: Optional[str] = None, -): - +) -> Path: timestamp = datetime.now().strftime('%b %d, %Y at %H:%M') - - parsed_content = await parse_article(url, source) - if parsed_content is None: - return {"error": "Failed to retrieve content"} - content = parsed_content["content"] - - readable_title = sanitize_filename(title if title else parsed_content.get("title", "Untitled")) - if not readable_title: - readable_title = timestamp - - markdown_path = OBSIDIAN_VAULT_DIR / "archive" - + readable_title = title if title else f"{url} - {timestamp}" + + content = await html_to_markdown(url, source) + if content is None: + raise HTTPException(status_code=400, detail="Failed to convert content to markdown") + + markdown_path, relative_path = assemble_archive_path(readable_title, ".md") + + markdown_content = f"---\n" + markdown_content += f"title: {readable_title}\n" + markdown_content += f"added: {timestamp}\n" + markdown_content += f"---\n\n" + markdown_content += f"# {readable_title}\n\n" + markdown_content += content + try: - frontmatter = f"""--- -title: {readable_title} -author: {parsed_content.get('author', 'Unknown')} -published: {parsed_content.get('date_published', 'Unknown')} -added: {timestamp} -excerpt: {parsed_content.get('excerpt', '')} ---- -""" - body = f"# {readable_title}\n\n" - - try: - authors = parsed_content.get('author', '') - authors_in_brackets = [f"[[{author.strip()}]]" for author in authors.split(",")] - authors_string = ", ".join(authors_in_brackets) - - body += f"by {authors_string} in [{parsed_content.get('domain', urlparse(url).netloc.replace('www.', ''))}]({parsed_content.get('url', url)}).\n\n" - body += content - markdown_content = frontmatter + body - except Exception as e: - L.ERR(f"Failed to combine elements of article markdown.") - - try: - with open(markdown_path, 'w', encoding=encoding) as md_file: - md_file.write(markdown_content) - - L.INFO(f"Successfully saved to {markdown_path}") - add_to_daily_note - return markdown_path - - except Exception as e: - L.ERR(f"Failed to write markdown file") - raise HTTPException(status_code=500, detail=str(e)) - + markdown_path.parent.mkdir(parents=True, exist_ok=True) + with open(markdown_path, 'w', encoding=encoding) as md_file: + md_file.write(markdown_content) + L.INFO(f"Successfully saved to {markdown_path}") + return markdown_path except Exception as e: - L.ERR(f"Failed to clip {url}: {str(e)}") - raise HTTPException(status_code=500, detail=str(e)) - + L.ERR(f"Failed to write markdown file: {str(e)}") + raise HTTPException(status_code=500, detail=f"Failed to write markdown file: {str(e)}") def download_file(url, folder): os.makedirs(folder, exist_ok=True) @@ -569,7 +573,6 @@ def copy_file(local_path, folder): return filename - async def save_file(file: UploadFile, folder: Path) -> Path: file_path = folder / f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{file.filename}" with open(file_path, 'wb') as f: diff --git a/sijapi/utilities.py b/sijapi/utilities.py index f5151e8..a23b8bf 100644 --- a/sijapi/utilities.py +++ b/sijapi/utilities.py @@ -25,7 +25,7 @@ import asyncpg from sshtunnel import SSHTunnelForwarder from fastapi import Depends, HTTPException, Request, UploadFile from fastapi.security.api_key import APIKeyHeader -from sijapi import L, GLOBAL_API_KEY, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_FILENAME_LENGTH +from sijapi import L, GLOBAL_API_KEY, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_PATH_LENGTH, ARCHIVE_DIR api_key_header = APIKeyHeader(name="Authorization") @@ -38,6 +38,35 @@ def validate_api_key(request: Request, api_key: str = Depends(api_key_header)): raise HTTPException(status_code=401, detail="Invalid or missing API key") +def assemble_archive_path(filename: str, extension: str = ".md", date_time: datetime = datetime.now(), subdir: str = None) -> Tuple[Path, Path]: + year = date_time.strftime(YEAR_FMT) + month = date_time.strftime(MONTH_FMT) + day = date_time.strftime(DAY_FMT) + day_short = date_time.strftime(DAY_SHORT_FMT) + timestamp = date_time.strftime("%H%M%S") + + # Ensure the extension is preserved + base_name, ext = os.path.splitext(filename) + extension = ext if ext else extension + + # Initial sanitization + sanitized_base = sanitize_filename(base_name, '') + filename = f"{day_short} {timestamp} {sanitized_base}{extension}" + + relative_path = Path(year) / month / day / filename + absolute_path = ARCHIVE_DIR / relative_path + + # Ensure the total path length doesn't exceed MAX_PATH_LENGTH + while len(str(absolute_path)) > MAX_PATH_LENGTH: + # Truncate the sanitized_base, not the full filename + sanitized_base = sanitized_base[:-1] + filename = f"{day_short} {timestamp} {sanitized_base}{extension}" + relative_path = Path(year) / month / day / filename + absolute_path = ARCHIVE_DIR / relative_path + + return absolute_path, relative_path + + def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str = None, extension: str = None, no_timestamp: bool = False) -> Tuple[Path, Path]: ''' Obsidian helper. Takes a datetime and optional subdirectory name, filename, and extension. @@ -51,32 +80,22 @@ def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str timestamp = date_time.strftime("%H%M%S") relative_path = Path("journal") / year / month / day - if not subdir and not filename and not extension: - # standard daily note handler, where only the date_time was specified: relative_path = relative_path / f"{day}.md" else: - if subdir: - # datestamped subdirectory handler relative_path = relative_path / f"{day_short} {subdir}" if filename: - filename = sanitize_filename(filename) - filename = f"{day_short} {filename}" if no_timestamp else f"{day_short} {timestamp} {filename}" - if extension: extension = extension if extension.startswith(".") else f".{extension}" - filename = f"{filename}{extension}" if not filename.endswith(extension) else filename - else: - if has_valid_extension(filename, [".md", ".m4a", ".wav", ".aiff", ".flac", ".mp3", ".mp4", ".pdf", ".js", ".json", ".yaml", ".py"]): - L.DEBUG(f"Provided filename has a valid extension, so we use that.") - else: - filename = f"{filename}.md" - L.DEBUG(f"We are forcing the file to be a .md") - + extension = validate_extension(filename, [".md", ".m4a", ".wav", ".aiff", ".flac", ".mp3", ".mp4", ".pdf", ".js", ".json", ".yaml", ".py"]) or ".md" + + filename = sanitize_filename(filename) + filename = f"{day_short} {filename}" if no_timestamp else f"{day_short} {timestamp} {filename}" + filename = f"{filename}{extension}" if not filename.endswith(extension) else filename relative_path = relative_path / filename else: @@ -84,20 +103,16 @@ def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str return None, None absolute_path = OBSIDIAN_VAULT_DIR / relative_path - os.makedirs(absolute_path.parent, exist_ok=True) - return absolute_path, relative_path -def has_valid_extension(filename, valid_extensions=None): +def validate_extension(filename, valid_extensions=None): if valid_extensions is None: - # Check if there's any extension - return bool(os.path.splitext(filename)[1]) + return os.path.splitext(filename) else: - # Check if the extension is in the list of valid extensions - return os.path.splitext(filename)[1].lower() in valid_extensions - + extension = os.path.splitext(filename)[-1].lower() + return extension if extension in valid_extensions else None def prefix_lines(text: str, prefix: str = '> ') -> str: lines = text.split('\n') @@ -138,7 +153,7 @@ def get_extension(file): -def sanitize_filename(text, max_length=MAX_FILENAME_LENGTH): +def sanitize_filename(text, extension: str = None, max_length: int = MAX_PATH_LENGTH): """Sanitize a string to be used as a safe filename while protecting the file extension.""" L.DEBUG(f"Filename before sanitization: {text}") @@ -149,7 +164,7 @@ def sanitize_filename(text, max_length=MAX_FILENAME_LENGTH): max_base_length = max_length - len(extension) if len(base_name) > max_base_length: - base_name = base_name[:max_base_length].rstrip() + base_name = base_name[:max_base_length - 5].rstrip() final_filename = base_name + extension L.DEBUG(f"Filename after sanitization: {final_filename}")