Auto-update: Thu Jun 27 09:46:17 PDT 2024

sanj 2024-06-27 09:46:17 -07:00
parent d1f5c923ca
commit 8f095e5e71
4 changed files with 175 additions and 161 deletions

View file

@@ -72,7 +72,7 @@ DynamicTZ = TimezoneTracker(DB)
### Obsidian & notes
ALLOWED_FILENAME_CHARS = r'[^\w \.-]'
MAX_FILENAME_LENGTH = 255
MAX_PATH_LENGTH = 254
OBSIDIAN_VAULT_DIR = Path(os.getenv("OBSIDIAN_BASE_DIR") or HOME_DIR / "Nextcloud" / "notes")
OBSIDIAN_JOURNAL_DIR = OBSIDIAN_VAULT_DIR / "journal"
OBSIDIAN_RESOURCES_DIR = "obsidian/resources"
@@ -80,6 +80,8 @@ OBSIDIAN_BANNER_DIR = f"{OBSIDIAN_RESOURCES_DIR}/banners"
os.makedirs(Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_BANNER_DIR, exist_ok=True)
OBSIDIAN_BANNER_SCENE = os.getenv("OBSIDIAN_BANNER_SCENE", "wallpaper")
OBSIDIAN_CHROMADB_COLLECTION = os.getenv("OBSIDIAN_CHROMADB_COLLECTION", "obsidian")
ARCHIVE_DIR = Path(os.getenv("ARCHIVE_DIR", OBSIDIAN_VAULT_DIR / "archive"))
os.makedirs(ARCHIVE_DIR, exist_ok=True)
DOC_DIR = DATA_DIR / "docs"
os.makedirs(DOC_DIR, exist_ok=True)
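A side note on the two fallback styles this hunk uses: os.getenv("OBSIDIAN_BASE_DIR") or ... falls back even when the variable is set but empty, while the two-argument os.getenv("ARCHIVE_DIR", ...) returns an empty string as-is. A minimal sketch of the difference (HOME_DIR here stands in for the project's constant):

import os
from pathlib import Path

HOME_DIR = Path.home()  # stand-in for the project's HOME_DIR
os.environ["OBSIDIAN_BASE_DIR"] = ""  # set, but empty
os.environ["ARCHIVE_DIR"] = ""        # likewise

# The "or" form treats the empty string as unset and still falls back:
vault = Path(os.environ.get("OBSIDIAN_BASE_DIR") or HOME_DIR / "Nextcloud" / "notes")
print(vault)  # .../Nextcloud/notes

# The two-argument default does not, so the empty value wins:
archive = Path(os.environ.get("ARCHIVE_DIR", vault / "archive"))
print(archive)  # ".", not the intended archive directory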

View file

@@ -51,23 +51,23 @@
"inputs": {
"batch_size": 1,
"width": 1023,
"height": 1025,
"height": 1024,
"resampling": "bicubic",
"X": 0,
"Y": 0,
"Z": 0,
"evolution": 0.1,
"frame": 1,
"scale": 13.1,
"scale": 6.66,
"octaves": 8,
"persistence": 6.2,
"lacunarity": 5.38,
"exponent": 4.5600000000000005,
"brightness": -0.16,
"contrast": -0.13,
"persistence": 3,
"lacunarity": 6.66,
"exponent": 1,
"brightness": 0,
"contrast": 0,
"clamp_min": 0,
"clamp_max": 1,
"seed": 474669046020372,
"seed": 300432080108380,
"device": "cpu",
"optional_vae": [
"4",
@@ -81,10 +81,10 @@
},
"13": {
"inputs": {
"seed": 484066073734968,
"steps": 8,
"seed": 1125631171146107,
"steps": 10,
"cfg": 1.8,
"sampler_name": "dpmpp_2m_sde",
"sampler_name": "dpmpp_2s_ancestral",
"scheduler": "karras",
"start_at_step": 0,
"end_at_step": 10000,
@@ -197,57 +197,6 @@
"title": "CLIP Text Encode (Prompt)"
}
},
"22": {
"inputs": {
"upscale_by": 2,
"seed": 589846903558615,
"steps": 20,
"cfg": 1.6,
"sampler_name": "heun",
"scheduler": "sgm_uniform",
"denoise": 0.21,
"mode_type": "Linear",
"tile_width": 512,
"tile_height": 512,
"mask_blur": 8,
"tile_padding": 32,
"seam_fix_mode": "Band Pass",
"seam_fix_denoise": 1,
"seam_fix_width": 64,
"seam_fix_mask_blur": 8,
"seam_fix_padding": 16,
"force_uniform_tiles": true,
"tiled_decode": true,
"image": [
"38",
0
],
"model": [
"4",
0
],
"positive": [
"6",
0
],
"negative": [
"23",
0
],
"vae": [
"4",
2
],
"upscale_model": [
"24",
0
]
},
"class_type": "UltimateSDUpscale",
"_meta": {
"title": "Ultimate SD Upscale"
}
},
"23": {
"inputs": {
"conditioning": [
@@ -276,7 +225,7 @@
0
],
"image": [
"22",
"39",
0
]
},
@@ -313,21 +262,6 @@
"title": "ImageBlur"
}
},
"36": {
"inputs": {
"mode": "bicubic",
"factor": 1.25,
"align": "true",
"samples": [
"13",
0
]
},
"class_type": "Latent Upscale by Factor (WAS)",
"_meta": {
"title": "Latent Upscale by Factor (WAS)"
}
},
"38": {
"inputs": {
"samples": [
@@ -343,5 +277,65 @@
"_meta": {
"title": "VAE Decode"
}
},
"39": {
"inputs": {
"upscale_by": 2,
"seed": 687912408861107,
"steps": 20,
"cfg": 1.9000000000000001,
"sampler_name": "heun",
"scheduler": "sgm_uniform",
"denoise": 0.2,
"mode_type": "Linear",
"tile_width": 512,
"tile_height": 512,
"mask_blur": 8,
"tile_padding": 32,
"seam_fix_mode": "Band Pass",
"seam_fix_denoise": 1,
"seam_fix_width": 64,
"seam_fix_mask_blur": 8,
"seam_fix_padding": 16,
"force_uniform_tiles": true,
"tiled_decode": true,
"image": [
"38",
0
],
"model": [
"4",
0
],
"positive": [
"6",
0
],
"negative": [
"23",
0
],
"vae": [
"4",
2
],
"upscale_model": [
"40",
0
]
},
"class_type": "UltimateSDUpscale",
"_meta": {
"title": "Ultimate SD Upscale"
}
},
"40": {
"inputs": {
"model_name": "RealESRGAN_x2.pth"
},
"class_type": "UpscaleModelLoader",
"_meta": {
"title": "Load Upscale Model"
}
}
}
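Net effect of this file's hunks: the inline Ultimate SD Upscale node "22" and the WAS latent-upscale node "36" are removed, and a new pair takes over: node "39" runs Ultimate SD Upscale with gentler settings (denoise 0.2, cfg 1.9) against an upscale model loaded explicitly by node "40" (RealESRGAN_x2.pth). For context, an API-format workflow like this one is typically queued against a running ComfyUI server roughly as follows (a sketch only; the host and port are ComfyUI's usual local defaults, and /prompt is ComfyUI's standard queueing endpoint, none of which this repo pins down):

import json
import uuid
import requests

COMFYUI_URL = "http://127.0.0.1:8188"  # assumption: stock local ComfyUI

def queue_workflow(path: str) -> str:
    """Queue an API-format workflow file; returns ComfyUI's prompt id."""
    with open(path) as f:
        workflow = json.load(f)
    payload = {"prompt": workflow, "client_id": str(uuid.uuid4())}
    r = requests.post(f"{COMFYUI_URL}/prompt", json=payload, timeout=30)
    r.raise_for_status()
    return r.json()["prompt_id"]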

View file

@@ -7,15 +7,19 @@ from io import BytesIO
from pydantic import BaseModel
import os, re
import uuid
import aiohttp
import traceback
import requests
import mimetypes
import shutil
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from typing import Optional, Union, Dict, List, Tuple
from urllib.parse import urlparse
from urllib3.util.retry import Retry
from newspaper import Article
import trafilatura
from readability import Document
from requests.adapters import HTTPAdapter
import re
import os
@@ -23,10 +27,10 @@ from datetime import timedelta, datetime, time as dt_time, date as dt_date
from fastapi import HTTPException, status
from pathlib import Path
from fastapi import APIRouter, Query, HTTPException
from sijapi import L, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, BASE_URL, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, TZ
from sijapi import L, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, ARCHIVE_DIR, BASE_URL, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, TZ
from sijapi.routers import tts, llm, time, sd, locate, weather, asr, calendar
from sijapi.routers.locate import Location
from sijapi.utilities import assemble_journal_path, convert_to_12_hour_format, sanitize_filename, convert_degrees_to_cardinal, HOURLY_COLUMNS_MAPPING
from sijapi.utilities import assemble_journal_path, assemble_archive_path, convert_to_12_hour_format, sanitize_filename, convert_degrees_to_cardinal, HOURLY_COLUMNS_MAPPING
note = APIRouter()
@@ -440,9 +444,9 @@ async def parse_article(url: str, source: Optional[str] = None):
L.INFO(f"Parsed {np3k.title}")
title = np3k.title or traf.title
title = (np3k.title or traf.title) or url
authors = np3k.authors or traf.author
authors = authors if isinstance(authors, List) else [authors]
authors = (authors if isinstance(authors, List) else [authors])
date = np3k.publish_date or traf.date
try:
date = await locate.localize_datetime(date)
@@ -469,6 +473,33 @@
}
async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
    if source:
        html_content = source
    elif url:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                html_content = await response.text()
    else:
        L.ERR("html_to_markdown got neither a URL nor source HTML; nothing to convert.")
        return None
    # Use readability to extract the main content
    doc = Document(html_content)
    cleaned_html = doc.summary()
    # Parse the cleaned HTML with BeautifulSoup for any additional processing
    soup = BeautifulSoup(cleaned_html, 'html.parser')
    # Remove any remaining unwanted elements
    for element in soup(['script', 'style']):
        element.decompose()
    # Convert to markdown
    markdown_content = md(str(soup), heading_style="ATX")
    return markdown_content
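A quick usage sketch for the new helper (made-up HTML, with html_to_markdown as defined above):

import asyncio

sample = ("<html><body><nav>menu</nav>"
          "<article><h1>Example</h1><p>Body text.</p></article>"
          "<script>track()</script></body></html>")
# readability keeps the <article>, the script tag is stripped, and
# markdownify renders the heading ATX-style ("# Example").
print(asyncio.run(html_to_markdown(source=sample)))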
async def process_archive(
background_tasks: BackgroundTasks,
@@ -476,59 +507,32 @@ async def process_archive(
title: Optional[str] = None,
encoding: str = 'utf-8',
source: Optional[str] = None,
):
) -> Path:
timestamp = datetime.now().strftime('%b %d, %Y at %H:%M')
readable_title = title if title else f"{url} - {timestamp}"
parsed_content = await parse_article(url, source)
if parsed_content is None:
return {"error": "Failed to retrieve content"}
content = parsed_content["content"]
content = await html_to_markdown(url, source)
if content is None:
raise HTTPException(status_code=400, detail="Failed to convert content to markdown")
readable_title = sanitize_filename(title if title else parsed_content.get("title", "Untitled"))
if not readable_title:
readable_title = timestamp
markdown_path, relative_path = assemble_archive_path(readable_title, ".md")
markdown_path = OBSIDIAN_VAULT_DIR / "archive"
try:
frontmatter = f"""---
title: {readable_title}
author: {parsed_content.get('author', 'Unknown')}
published: {parsed_content.get('date_published', 'Unknown')}
added: {timestamp}
excerpt: {parsed_content.get('excerpt', '')}
---
"""
body = f"# {readable_title}\n\n"
try:
authors = parsed_content.get('author', '')
authors_in_brackets = [f"[[{author.strip()}]]" for author in authors.split(",")]
authors_string = ", ".join(authors_in_brackets)
body += f"by {authors_string} in [{parsed_content.get('domain', urlparse(url).netloc.replace('www.', ''))}]({parsed_content.get('url', url)}).\n\n"
body += content
markdown_content = frontmatter + body
except Exception as e:
L.ERR(f"Failed to combine elements of article markdown.")
markdown_content = f"---\n"
markdown_content += f"title: {readable_title}\n"
markdown_content += f"added: {timestamp}\n"
markdown_content += f"---\n\n"
markdown_content += f"# {readable_title}\n\n"
markdown_content += content
try:
markdown_path.parent.mkdir(parents=True, exist_ok=True)
with open(markdown_path, 'w', encoding=encoding) as md_file:
md_file.write(markdown_content)
L.INFO(f"Successfully saved to {markdown_path}")
add_to_daily_note  # FIXME: bare name is a no-op (or a NameError if undefined); the intended call appears to be missing
return markdown_path
except Exception as e:
L.ERR(f"Failed to write markdown file")
raise HTTPException(status_code=500, detail=str(e))
except Exception as e:
L.ERR(f"Failed to clip {url}: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
L.ERR(f"Failed to write markdown file: {str(e)}")
raise HTTPException(status_code=500, detail=f"Failed to write markdown file: {str(e)}")
def download_file(url, folder):
    os.makedirs(folder, exist_ok=True)
@@ -569,7 +573,6 @@ def copy_file(local_path, folder):
    return filename
async def save_file(file: UploadFile, folder: Path) -> Path:
    file_path = folder / f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{file.filename}"
    with open(file_path, 'wb') as f:

View file

@@ -25,7 +25,7 @@ import asyncpg
from sshtunnel import SSHTunnelForwarder
from fastapi import Depends, HTTPException, Request, UploadFile
from fastapi.security.api_key import APIKeyHeader
from sijapi import L, GLOBAL_API_KEY, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_FILENAME_LENGTH
from sijapi import L, GLOBAL_API_KEY, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_PATH_LENGTH, ARCHIVE_DIR
api_key_header = APIKeyHeader(name="Authorization")
@@ -38,6 +38,35 @@ def validate_api_key(request: Request, api_key: str = Depends(api_key_header)):
raise HTTPException(status_code=401, detail="Invalid or missing API key")
def assemble_archive_path(filename: str, extension: str = ".md", date_time: datetime = datetime.now(), subdir: str = None) -> Tuple[Path, Path]:
    year = date_time.strftime(YEAR_FMT)
    month = date_time.strftime(MONTH_FMT)
    day = date_time.strftime(DAY_FMT)
    day_short = date_time.strftime(DAY_SHORT_FMT)
    timestamp = date_time.strftime("%H%M%S")
    # Ensure the extension is preserved
    base_name, ext = os.path.splitext(filename)
    extension = ext if ext else extension
    # Initial sanitization
    sanitized_base = sanitize_filename(base_name, '')
    filename = f"{day_short} {timestamp} {sanitized_base}{extension}"
    relative_path = Path(year) / month / day / filename
    absolute_path = ARCHIVE_DIR / relative_path
    # Ensure the total path length doesn't exceed MAX_PATH_LENGTH
    while len(str(absolute_path)) > MAX_PATH_LENGTH:
        # Truncate the sanitized_base, not the full filename
        sanitized_base = sanitized_base[:-1]
        filename = f"{day_short} {timestamp} {sanitized_base}{extension}"
        relative_path = Path(year) / month / day / filename
        absolute_path = ARCHIVE_DIR / relative_path
    return absolute_path, relative_path
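The loop above only ever shortens the sanitized base, so the datestamp and the extension always survive truncation. A toy re-run with a small limit makes the effect visible (MAX and root are stand-ins for the real MAX_PATH_LENGTH = 254 and ARCHIVE_DIR):

from pathlib import Path

MAX = 40
root = Path("/archive")
base, ext, stamp = "a-very-long-sanitized-title", ".md", "2024 093000"
path = root / f"{stamp} {base}{ext}"
while len(str(path)) > MAX:
    base = base[:-1]                      # trim the base only
    path = root / f"{stamp} {base}{ext}"  # stamp and extension intact
print(path)  # /archive/2024 093000 a-very-long-sani.md (exactly 40 chars)

One edge worth noting: if the fixed parts alone ever exceeded the limit, the base would empty out and the loop would never terminate.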
def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str = None, extension: str = None, no_timestamp: bool = False) -> Tuple[Path, Path]:
'''
Obsidian helper. Takes a datetime and optional subdirectory name, filename, and extension.
@@ -51,32 +80,22 @@ def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str
timestamp = date_time.strftime("%H%M%S")
relative_path = Path("journal") / year / month / day
if not subdir and not filename and not extension:
# standard daily note handler, where only the date_time was specified:
relative_path = relative_path / f"{day}.md"
else:
if subdir:
# datestamped subdirectory handler
relative_path = relative_path / f"{day_short} {subdir}"
if filename:
filename = sanitize_filename(filename)
filename = f"{day_short} {filename}" if no_timestamp else f"{day_short} {timestamp} {filename}"
if extension:
extension = extension if extension.startswith(".") else f".{extension}"
else:
extension = validate_extension(filename, [".md", ".m4a", ".wav", ".aiff", ".flac", ".mp3", ".mp4", ".pdf", ".js", ".json", ".yaml", ".py"]) or ".md"
filename = sanitize_filename(filename)
filename = f"{day_short} {filename}" if no_timestamp else f"{day_short} {timestamp} {filename}"
filename = f"{filename}{extension}" if not filename.endswith(extension) else filename
else:
if has_valid_extension(filename, [".md", ".m4a", ".wav", ".aiff", ".flac", ".mp3", ".mp4", ".pdf", ".js", ".json", ".yaml", ".py"]):
L.DEBUG(f"Provided filename has a valid extension, so we use that.")
else:
filename = f"{filename}.md"
L.DEBUG(f"We are forcing the file to be a .md")
relative_path = relative_path / filename
else:
@@ -84,20 +103,16 @@ def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str
return None, None
absolute_path = OBSIDIAN_VAULT_DIR / relative_path
os.makedirs(absolute_path.parent, exist_ok=True)
return absolute_path, relative_path
def has_valid_extension(filename, valid_extensions=None):
def validate_extension(filename, valid_extensions=None):
if valid_extensions is None:
# Check if there's any extension
return bool(os.path.splitext(filename)[1])
return os.path.splitext(filename)
else:
# Check if the extension is in the list of valid extensions
return os.path.splitext(filename)[1].lower() in valid_extensions
extension = os.path.splitext(filename)[-1].lower()
return extension if extension in valid_extensions else None
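With the new return-the-extension behavior, callers can use the result directly and fall back when it is None, which is exactly how assemble_journal_path uses it above (or ".md"). A couple of illustrative calls:

print(validate_extension("notes.md", [".md", ".pdf"]))   # ".md"
print(validate_extension("photo.PNG", [".md", ".pdf"]))  # None (lowercased, not in list)
extension = validate_extension("voice.m4a", [".md", ".m4a"]) or ".md"  # ".m4a"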
def prefix_lines(text: str, prefix: str = '> ') -> str:
lines = text.split('\n')
@@ -138,7 +153,7 @@ def get_extension(file):
def sanitize_filename(text, max_length=MAX_FILENAME_LENGTH):
def sanitize_filename(text, extension: str = None, max_length: int = MAX_PATH_LENGTH):
"""Sanitize a string to be used as a safe filename while protecting the file extension."""
L.DEBUG(f"Filename before sanitization: {text}")
@@ -149,7 +164,7 @@ def sanitize_filename(text, max_length=MAX_FILENAME_LENGTH):
max_base_length = max_length - len(extension)
if len(base_name) > max_base_length:
base_name = base_name[:max_base_length].rstrip()
base_name = base_name[:max_base_length - 5].rstrip()
final_filename = base_name + extension
L.DEBUG(f"Filename after sanitization: {final_filename}")