From 8f095e5e71b87782ade19bbe93b5314e07d703bd Mon Sep 17 00:00:00 2001
From: sanj <67624670+iodrift@users.noreply.github.com>
Date: Thu, 27 Jun 2024 09:46:17 -0700
Subject: [PATCH] Auto-update: Thu Jun 27 09:46:17 PDT 2024

---
 sijapi/__init__.py                      |   4 +-
 sijapi/data/sd/workflows/wallpaper.json | 150 ++++++++++++------------
 sijapi/routers/note.py                  | 115 +++++++++---------
 sijapi/utilities.py                     |  67 +++++++----
 4 files changed, 175 insertions(+), 161 deletions(-)

diff --git a/sijapi/__init__.py b/sijapi/__init__.py
index 9e1d068..30f0290 100644
--- a/sijapi/__init__.py
+++ b/sijapi/__init__.py
@@ -72,7 +72,7 @@ DynamicTZ = TimezoneTracker(DB)
 
 ### Obsidian & notes
 ALLOWED_FILENAME_CHARS = r'[^\w \.-]'
-MAX_FILENAME_LENGTH = 255
+MAX_PATH_LENGTH = 254
 OBSIDIAN_VAULT_DIR = Path(os.getenv("OBSIDIAN_BASE_DIR") or HOME_DIR / "Nextcloud" / "notes")
 OBSIDIAN_JOURNAL_DIR = OBSIDIAN_VAULT_DIR / "journal"
 OBSIDIAN_RESOURCES_DIR = "obsidian/resources"
@@ -80,6 +80,8 @@ OBSIDIAN_BANNER_DIR = f"{OBSIDIAN_RESOURCES_DIR}/banners"
 os.makedirs(Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_BANNER_DIR, exist_ok=True)
 OBSIDIAN_BANNER_SCENE = os.getenv("OBSIDIAN_BANNER_SCENE", "wallpaper")
 OBSIDIAN_CHROMADB_COLLECTION = os.getenv("OBSIDIAN_CHROMADB_COLLECTION", "obsidian")
+ARCHIVE_DIR = Path(os.getenv("ARCHIVE_DIR", OBSIDIAN_VAULT_DIR / "archive"))
+os.makedirs(ARCHIVE_DIR, exist_ok=True)
 DOC_DIR = DATA_DIR / "docs"
 os.makedirs(DOC_DIR, exist_ok=True)
 
diff --git a/sijapi/data/sd/workflows/wallpaper.json b/sijapi/data/sd/workflows/wallpaper.json
index e541bd6..fdf0062 100644
--- a/sijapi/data/sd/workflows/wallpaper.json
+++ b/sijapi/data/sd/workflows/wallpaper.json
@@ -51,23 +51,23 @@
     "inputs": {
       "batch_size": 1,
       "width": 1023,
-      "height": 1025,
+      "height": 1024,
       "resampling": "bicubic",
       "X": 0,
       "Y": 0,
       "Z": 0,
       "evolution": 0.1,
       "frame": 1,
-      "scale": 13.1,
+      "scale": 6.66,
       "octaves": 8,
-      "persistence": 6.2,
-      "lacunarity": 5.38,
-      "exponent": 4.5600000000000005,
-      "brightness": -0.16,
-      "contrast": -0.13,
+      "persistence": 3,
+      "lacunarity": 6.66,
+      "exponent": 1,
+      "brightness": 0,
+      "contrast": 0,
       "clamp_min": 0,
       "clamp_max": 1,
-      "seed": 474669046020372,
+      "seed": 300432080108380,
       "device": "cpu",
       "optional_vae": [
         "4",
@@ -81,10 +81,10 @@
   },
   "13": {
     "inputs": {
-      "seed": 484066073734968,
-      "steps": 8,
+      "seed": 1125631171146107,
+      "steps": 10,
       "cfg": 1.8,
-      "sampler_name": "dpmpp_2m_sde",
+      "sampler_name": "dpmpp_2s_ancestral",
       "scheduler": "karras",
       "start_at_step": 0,
       "end_at_step": 10000,
@@ -197,57 +197,6 @@
       "title": "CLIP Text Encode (Prompt)"
     }
   },
-  "22": {
-    "inputs": {
-      "upscale_by": 2,
-      "seed": 589846903558615,
-      "steps": 20,
-      "cfg": 1.6,
-      "sampler_name": "heun",
-      "scheduler": "sgm_uniform",
-      "denoise": 0.21,
-      "mode_type": "Linear",
-      "tile_width": 512,
-      "tile_height": 512,
-      "mask_blur": 8,
-      "tile_padding": 32,
-      "seam_fix_mode": "Band Pass",
-      "seam_fix_denoise": 1,
-      "seam_fix_width": 64,
-      "seam_fix_mask_blur": 8,
-      "seam_fix_padding": 16,
-      "force_uniform_tiles": true,
-      "tiled_decode": true,
-      "image": [
-        "38",
-        0
-      ],
-      "model": [
-        "4",
-        0
-      ],
-      "positive": [
-        "6",
-        0
-      ],
-      "negative": [
-        "23",
-        0
-      ],
-      "vae": [
-        "4",
-        2
-      ],
-      "upscale_model": [
-        "24",
-        0
-      ]
-    },
-    "class_type": "UltimateSDUpscale",
-    "_meta": {
-      "title": "Ultimate SD Upscale"
-    }
-  },
   "23": {
     "inputs": {
       "conditioning": [
@@ -276,7 +225,7 @@
         0
       ],
       "image": [
-        "22",
+        "39",
         0
       ]
     },
@@ -313,21 +262,6 @@
       "title": "ImageBlur"
     }
   },
-  "36": {
-    "inputs": {
-      "mode": "bicubic",
-      "factor": 1.25,
-      "align": "true",
-      "samples": [
-        "13",
-        0
-      ]
-    },
-    "class_type": "Latent Upscale by Factor (WAS)",
-    "_meta": {
-      "title": "Latent Upscale by Factor (WAS)"
-    }
-  },
   "38": {
     "inputs": {
       "samples": [
@@ -343,5 +277,65 @@
     "_meta": {
       "title": "VAE Decode"
     }
+  },
+  "39": {
+    "inputs": {
+      "upscale_by": 2,
+      "seed": 687912408861107,
+      "steps": 20,
+      "cfg": 1.9000000000000001,
+      "sampler_name": "heun",
+      "scheduler": "sgm_uniform",
+      "denoise": 0.2,
+      "mode_type": "Linear",
+      "tile_width": 512,
+      "tile_height": 512,
+      "mask_blur": 8,
+      "tile_padding": 32,
+      "seam_fix_mode": "Band Pass",
+      "seam_fix_denoise": 1,
+      "seam_fix_width": 64,
+      "seam_fix_mask_blur": 8,
+      "seam_fix_padding": 16,
+      "force_uniform_tiles": true,
+      "tiled_decode": true,
+      "image": [
+        "38",
+        0
+      ],
+      "model": [
+        "4",
+        0
+      ],
+      "positive": [
+        "6",
+        0
+      ],
+      "negative": [
+        "23",
+        0
+      ],
+      "vae": [
+        "4",
+        2
+      ],
+      "upscale_model": [
+        "40",
+        0
+      ]
+    },
+    "class_type": "UltimateSDUpscale",
+    "_meta": {
+      "title": "Ultimate SD Upscale"
+    }
+  },
+  "40": {
+    "inputs": {
+      "model_name": "RealESRGAN_x2.pth"
+    },
+    "class_type": "UpscaleModelLoader",
+    "_meta": {
+      "title": "Load Upscale Model"
+    }
   }
 }
\ No newline at end of file
diff --git a/sijapi/routers/note.py b/sijapi/routers/note.py
index 2491cb0..d1d0813 100644
--- a/sijapi/routers/note.py
+++ b/sijapi/routers/note.py
@@ -7,15 +7,19 @@ from io import BytesIO
 from pydantic import BaseModel
 import os, re
 import uuid
+import aiohttp
 import traceback
 import requests
 import mimetypes
 import shutil
+from bs4 import BeautifulSoup
+from markdownify import markdownify as md
 from typing import Optional, Union, Dict, List, Tuple
 from urllib.parse import urlparse
 from urllib3.util.retry import Retry
 from newspaper import Article
 import trafilatura
+from readability import Document
 from requests.adapters import HTTPAdapter
 import re
 import os
@@ -23,10 +27,10 @@ from datetime import timedelta, datetime, time as dt_time, date as dt_date
 from fastapi import HTTPException, status
 from pathlib import Path
 from fastapi import APIRouter, Query, HTTPException
-from sijapi import L, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, BASE_URL, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, TZ
+from sijapi import L, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, ARCHIVE_DIR, BASE_URL, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, TZ
 from sijapi.routers import tts, llm, time, sd, locate, weather, asr, calendar
 from sijapi.routers.locate import Location
-from sijapi.utilities import assemble_journal_path, convert_to_12_hour_format, sanitize_filename, convert_degrees_to_cardinal, HOURLY_COLUMNS_MAPPING
+from sijapi.utilities import assemble_journal_path, assemble_archive_path, convert_to_12_hour_format, sanitize_filename, convert_degrees_to_cardinal, HOURLY_COLUMNS_MAPPING
 
 
 note = APIRouter()
@@ -440,9 +444,9 @@ async def parse_article(url: str, source: Optional[str] = None):
     L.INFO(f"Parsed {np3k.title}")
     
 
-    title = np3k.title or traf.title
+    title = (np3k.title or traf.title) or url
     authors = np3k.authors or traf.author
-    authors = authors if isinstance(authors, List) else [authors]
+    authors = (authors if isinstance(authors, List) else [authors])
     date = np3k.publish_date or traf.date
     try:
         date = await locate.localize_datetime(date)
@@ -455,7 +459,7 @@ async def parse_article(url: str, source: Optional[str] = None):
     domain = traf.sitename or urlparse(url).netloc.replace('www.', '').title()
     tags = np3k.meta_keywords or traf.categories or traf.tags
     tags = tags if isinstance(tags, List) else [tags]
-
+    
     return {
         'title': title.replace("  ", " "),
         'authors': authors,
@@ -469,6 +473,33 @@ async def parse_article(url: str, source: Optional[str] = None):
     }
 
 
+async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
+    """Convert an HTML page to markdown; return None when there is no input or the fetch fails.
+
+    `source` (raw HTML) takes precedence; `url` is only fetched when `source` is empty.
+    """
+    if source:
+        html_content = source
+    elif url:
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as response:
+                if response.status >= 400:
+                    # Don't archive an HTTP error page as if it were the article.
+                    L.ERR(f"Unable to fetch {url}: HTTP {response.status}")
+                    return None
+                html_content = await response.text()
+    else:
+        L.ERR("Unable to convert nothing to markdown.")
+        return None
+
+    # readability extracts the main content; BeautifulSoup then drops leftover script/style tags.
+    soup = BeautifulSoup(Document(html_content).summary(), 'html.parser')
+    for element in soup(['script', 'style']):
+        element.decompose()
+
+    # ATX style asks markdownify for `#`-prefixed headings.
+    return md(str(soup), heading_style="ATX")
+
 
 async def process_archive(
     background_tasks: BackgroundTasks,
@@ -476,59 +507,32 @@ async def process_archive(
     title: Optional[str] = None,
     encoding: str = 'utf-8',
     source: Optional[str] = None,
-):
-
+) -> Path:
     timestamp = datetime.now().strftime('%b %d, %Y at %H:%M')
-
-    parsed_content = await parse_article(url, source)
-    if parsed_content is None:
-        return {"error": "Failed to retrieve content"}
-    content = parsed_content["content"]
-
-    readable_title = sanitize_filename(title if title else parsed_content.get("title", "Untitled"))
-    if not readable_title:
-        readable_title = timestamp
-
-    markdown_path = OBSIDIAN_VAULT_DIR / "archive"
-
+    readable_title = title if title else f"{url} - {timestamp}"
+    
+    content = await html_to_markdown(url, source)
+    if content is None:
+        raise HTTPException(status_code=400, detail="Failed to convert content to markdown")
+    
+    markdown_path, relative_path = assemble_archive_path(readable_title, ".md")
+    
+    markdown_content = f"---\n"
+    markdown_content += f"title: {readable_title}\n"
+    markdown_content += f"added: {timestamp}\n"
+    markdown_content += f"---\n\n"
+    markdown_content += f"# {readable_title}\n\n"
+    markdown_content += content
+    
     try:
-        frontmatter = f"""---
-title: {readable_title}
-author: {parsed_content.get('author', 'Unknown')}
-published: {parsed_content.get('date_published', 'Unknown')}
-added: {timestamp}
-excerpt: {parsed_content.get('excerpt', '')}
----
-"""
-        body = f"# {readable_title}\n\n"
-
-        try:
-            authors = parsed_content.get('author', '')
-            authors_in_brackets = [f"[[{author.strip()}]]" for author in authors.split(",")]
-            authors_string = ", ".join(authors_in_brackets)
-
-            body += f"by {authors_string} in [{parsed_content.get('domain', urlparse(url).netloc.replace('www.', ''))}]({parsed_content.get('url', url)}).\n\n"
-            body += content
-            markdown_content = frontmatter + body
-        except Exception as e:
-            L.ERR(f"Failed to combine elements of article markdown.")
-
-        try:
-            with open(markdown_path, 'w', encoding=encoding) as md_file:
-                md_file.write(markdown_content)
-
-            L.INFO(f"Successfully saved to {markdown_path}")
-            add_to_daily_note
-            return markdown_path
-        
-        except Exception as e:
-            L.ERR(f"Failed to write markdown file")
-            raise HTTPException(status_code=500, detail=str(e))
-        
+        markdown_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(markdown_path, 'w', encoding=encoding) as md_file:
+            md_file.write(markdown_content)
+        L.INFO(f"Successfully saved to {markdown_path}")
+        return markdown_path
     except Exception as e:
-        L.ERR(f"Failed to clip {url}: {str(e)}")
-        raise HTTPException(status_code=500, detail=str(e))
-
+        L.ERR(f"Failed to write markdown file: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Failed to write markdown file: {str(e)}")
 
 def download_file(url, folder):
     os.makedirs(folder, exist_ok=True)
@@ -569,7 +573,6 @@ def copy_file(local_path, folder):
     return filename
 
 
-
 async def save_file(file: UploadFile, folder: Path) -> Path:
     file_path = folder / f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{file.filename}"
     with open(file_path, 'wb') as f:
diff --git a/sijapi/utilities.py b/sijapi/utilities.py
index f5151e8..a23b8bf 100644
--- a/sijapi/utilities.py
+++ b/sijapi/utilities.py
@@ -25,7 +25,7 @@ import asyncpg
 from sshtunnel import SSHTunnelForwarder
 from fastapi import Depends, HTTPException, Request, UploadFile
 from fastapi.security.api_key import APIKeyHeader
-from sijapi import L, GLOBAL_API_KEY, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_FILENAME_LENGTH
+from sijapi import L, GLOBAL_API_KEY, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_PATH_LENGTH, ARCHIVE_DIR
 
 api_key_header = APIKeyHeader(name="Authorization")
 
@@ -38,6 +38,35 @@ def validate_api_key(request: Request, api_key: str = Depends(api_key_header)):
             raise HTTPException(status_code=401, detail="Invalid or missing API key")
 
 
+def assemble_archive_path(filename: str, extension: str = ".md", date_time: datetime = None, subdir: str = None) -> Tuple[Path, Path]:
+    '''
+    Archive helper. Takes a filename (typically a title) and returns (absolute_path, relative_path)
+    for ARCHIVE_DIR/<year>/<month>/<day>/"<day_short> <HHMMSS> <name><extension>".
+    `date_time` defaults to the time of the call. An extension already present on `filename`
+    takes priority over `extension`. `subdir` is accepted but currently unused.
+    '''
+    # Resolve "now" per call: a `datetime.now()` default is evaluated only once, at import time.
+    date_time = datetime.now() if date_time is None else date_time
+    year = date_time.strftime(YEAR_FMT)
+    month = date_time.strftime(MONTH_FMT)
+    day = date_time.strftime(DAY_FMT)
+    day_short = date_time.strftime(DAY_SHORT_FMT)
+    timestamp = date_time.strftime("%H%M%S")
+
+    base_name, ext = os.path.splitext(filename)
+    extension = ext if ext else extension
+    sanitized_base = sanitize_filename(base_name, '')
+
+    # Trim only the base name, in a single slice, so the full path fits MAX_PATH_LENGTH. Slicing
+    # bottoms out at an empty base, so this terminates even if the directory part alone is too long.
+    filename = f"{day_short} {timestamp} {sanitized_base}{extension}"
+    overflow = len(str(ARCHIVE_DIR / year / month / day / filename)) - MAX_PATH_LENGTH
+    if overflow > 0:
+        filename = f"{day_short} {timestamp} {sanitized_base[:-overflow]}{extension}"
+    relative_path = Path(year) / month / day / filename
+    return ARCHIVE_DIR / relative_path, relative_path
+
+
 def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str = None, extension: str = None, no_timestamp: bool = False) -> Tuple[Path, Path]:
     '''
     Obsidian helper. Takes a datetime and optional subdirectory name, filename, and extension.
@@ -51,32 +80,22 @@ def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str
     timestamp = date_time.strftime("%H%M%S")
 
     relative_path = Path("journal") / year / month / day
-
     if not subdir and not filename and not extension:
-        # standard daily note handler, where only the date_time was specified:
         relative_path = relative_path / f"{day}.md"
 
     else:
-        
         if subdir:
-            # datestamped subdirectory handler
             relative_path = relative_path / f"{day_short} {subdir}"
 
         if filename:
-            filename = sanitize_filename(filename)
-            filename = f"{day_short} {filename}" if no_timestamp else f"{day_short} {timestamp} {filename}"
-
             if extension:
                 extension = extension if extension.startswith(".") else f".{extension}"
-                filename = f"{filename}{extension}" if not filename.endswith(extension) else filename
-
             else:
-                if has_valid_extension(filename, [".md", ".m4a", ".wav", ".aiff", ".flac", ".mp3", ".mp4", ".pdf", ".js", ".json", ".yaml", ".py"]):
-                    L.DEBUG(f"Provided filename has a valid extension, so we use that.")
-                else:
-                    filename = f"{filename}.md"
-                    L.DEBUG(f"We are forcing the file to be a .md")
-  
+                extension = validate_extension(filename, [".md", ".m4a", ".wav", ".aiff", ".flac", ".mp3", ".mp4", ".pdf", ".js", ".json", ".yaml", ".py"]) or ".md"
+               
+            filename = sanitize_filename(filename)
+            filename = f"{day_short} {filename}" if no_timestamp else f"{day_short} {timestamp} {filename}"
+            filename = f"{filename}{extension}" if not filename.endswith(extension) else filename
             relative_path = relative_path / filename
         
         else:
@@ -84,20 +103,16 @@ def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str
             return None, None
     
     absolute_path = OBSIDIAN_VAULT_DIR / relative_path 
-
     os.makedirs(absolute_path.parent, exist_ok=True)
- 
     return absolute_path, relative_path
 
 
-def has_valid_extension(filename, valid_extensions=None):
+def validate_extension(filename, valid_extensions=None):  # returns the lowercased extension, or None
     if valid_extensions is None:
-        # Check if there's any extension
-        return bool(os.path.splitext(filename)[1])
+        return os.path.splitext(filename)[-1].lower() or None  # no whitelist: any extension is accepted
     else:
-        # Check if the extension is in the list of valid extensions
-        return os.path.splitext(filename)[1].lower() in valid_extensions
-    
+        extension = os.path.splitext(filename)[-1].lower()
+        return extension if extension in valid_extensions else None
 
 def prefix_lines(text: str, prefix: str = '> ') -> str:
     lines = text.split('\n')
@@ -138,7 +153,7 @@ def get_extension(file):
 
 
 
-def sanitize_filename(text, max_length=MAX_FILENAME_LENGTH):
+def sanitize_filename(text, extension: str = None, max_length: int = MAX_PATH_LENGTH):
     """Sanitize a string to be used as a safe filename while protecting the file extension."""
     L.DEBUG(f"Filename before sanitization: {text}")
 
@@ -149,7 +164,7 @@ def sanitize_filename(text, max_length=MAX_FILENAME_LENGTH):
 
     max_base_length = max_length - len(extension)
     if len(base_name) > max_base_length:
-        base_name = base_name[:max_base_length].rstrip()
+        base_name = base_name[:max_base_length - 5].rstrip()
     final_filename = base_name + extension
 
     L.DEBUG(f"Filename after sanitization: {final_filename}")