Auto-update: Thu Jun 27 09:46:17 PDT 2024

This commit is contained in:
sanj 2024-06-27 09:46:17 -07:00
parent d1f5c923ca
commit 8f095e5e71
4 changed files with 175 additions and 161 deletions

View file

@ -72,7 +72,7 @@ DynamicTZ = TimezoneTracker(DB)
### Obsidian & notes ### Obsidian & notes
ALLOWED_FILENAME_CHARS = r'[^\w \.-]' ALLOWED_FILENAME_CHARS = r'[^\w \.-]'
MAX_FILENAME_LENGTH = 255 MAX_PATH_LENGTH = 254
OBSIDIAN_VAULT_DIR = Path(os.getenv("OBSIDIAN_BASE_DIR") or HOME_DIR / "Nextcloud" / "notes") OBSIDIAN_VAULT_DIR = Path(os.getenv("OBSIDIAN_BASE_DIR") or HOME_DIR / "Nextcloud" / "notes")
OBSIDIAN_JOURNAL_DIR = OBSIDIAN_VAULT_DIR / "journal" OBSIDIAN_JOURNAL_DIR = OBSIDIAN_VAULT_DIR / "journal"
OBSIDIAN_RESOURCES_DIR = "obsidian/resources" OBSIDIAN_RESOURCES_DIR = "obsidian/resources"
@ -80,6 +80,8 @@ OBSIDIAN_BANNER_DIR = f"{OBSIDIAN_RESOURCES_DIR}/banners"
os.makedirs(Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_BANNER_DIR, exist_ok=True) os.makedirs(Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_BANNER_DIR, exist_ok=True)
OBSIDIAN_BANNER_SCENE = os.getenv("OBSIDIAN_BANNER_SCENE", "wallpaper") OBSIDIAN_BANNER_SCENE = os.getenv("OBSIDIAN_BANNER_SCENE", "wallpaper")
OBSIDIAN_CHROMADB_COLLECTION = os.getenv("OBSIDIAN_CHROMADB_COLLECTION", "obsidian") OBSIDIAN_CHROMADB_COLLECTION = os.getenv("OBSIDIAN_CHROMADB_COLLECTION", "obsidian")
ARCHIVE_DIR = Path(os.getenv("ARCHIVE_DIR", OBSIDIAN_VAULT_DIR / "archive"))
os.makedirs(ARCHIVE_DIR, exist_ok=True)
DOC_DIR = DATA_DIR / "docs" DOC_DIR = DATA_DIR / "docs"
os.makedirs(DOC_DIR, exist_ok=True) os.makedirs(DOC_DIR, exist_ok=True)

View file

@ -51,23 +51,23 @@
"inputs": { "inputs": {
"batch_size": 1, "batch_size": 1,
"width": 1023, "width": 1023,
"height": 1025, "height": 1024,
"resampling": "bicubic", "resampling": "bicubic",
"X": 0, "X": 0,
"Y": 0, "Y": 0,
"Z": 0, "Z": 0,
"evolution": 0.1, "evolution": 0.1,
"frame": 1, "frame": 1,
"scale": 13.1, "scale": 6.66,
"octaves": 8, "octaves": 8,
"persistence": 6.2, "persistence": 3,
"lacunarity": 5.38, "lacunarity": 6.66,
"exponent": 4.5600000000000005, "exponent": 1,
"brightness": -0.16, "brightness": 0,
"contrast": -0.13, "contrast": 0,
"clamp_min": 0, "clamp_min": 0,
"clamp_max": 1, "clamp_max": 1,
"seed": 474669046020372, "seed": 300432080108380,
"device": "cpu", "device": "cpu",
"optional_vae": [ "optional_vae": [
"4", "4",
@ -81,10 +81,10 @@
}, },
"13": { "13": {
"inputs": { "inputs": {
"seed": 484066073734968, "seed": 1125631171146107,
"steps": 8, "steps": 10,
"cfg": 1.8, "cfg": 1.8,
"sampler_name": "dpmpp_2m_sde", "sampler_name": "dpmpp_2s_ancestral",
"scheduler": "karras", "scheduler": "karras",
"start_at_step": 0, "start_at_step": 0,
"end_at_step": 10000, "end_at_step": 10000,
@ -197,57 +197,6 @@
"title": "CLIP Text Encode (Prompt)" "title": "CLIP Text Encode (Prompt)"
} }
}, },
"22": {
"inputs": {
"upscale_by": 2,
"seed": 589846903558615,
"steps": 20,
"cfg": 1.6,
"sampler_name": "heun",
"scheduler": "sgm_uniform",
"denoise": 0.21,
"mode_type": "Linear",
"tile_width": 512,
"tile_height": 512,
"mask_blur": 8,
"tile_padding": 32,
"seam_fix_mode": "Band Pass",
"seam_fix_denoise": 1,
"seam_fix_width": 64,
"seam_fix_mask_blur": 8,
"seam_fix_padding": 16,
"force_uniform_tiles": true,
"tiled_decode": true,
"image": [
"38",
0
],
"model": [
"4",
0
],
"positive": [
"6",
0
],
"negative": [
"23",
0
],
"vae": [
"4",
2
],
"upscale_model": [
"24",
0
]
},
"class_type": "UltimateSDUpscale",
"_meta": {
"title": "Ultimate SD Upscale"
}
},
"23": { "23": {
"inputs": { "inputs": {
"conditioning": [ "conditioning": [
@ -276,7 +225,7 @@
0 0
], ],
"image": [ "image": [
"22", "39",
0 0
] ]
}, },
@ -313,21 +262,6 @@
"title": "ImageBlur" "title": "ImageBlur"
} }
}, },
"36": {
"inputs": {
"mode": "bicubic",
"factor": 1.25,
"align": "true",
"samples": [
"13",
0
]
},
"class_type": "Latent Upscale by Factor (WAS)",
"_meta": {
"title": "Latent Upscale by Factor (WAS)"
}
},
"38": { "38": {
"inputs": { "inputs": {
"samples": [ "samples": [
@ -343,5 +277,65 @@
"_meta": { "_meta": {
"title": "VAE Decode" "title": "VAE Decode"
} }
},
"39": {
"inputs": {
"upscale_by": 2,
"seed": 687912408861107,
"steps": 20,
"cfg": 1.9000000000000001,
"sampler_name": "heun",
"scheduler": "sgm_uniform",
"denoise": 0.2,
"mode_type": "Linear",
"tile_width": 512,
"tile_height": 512,
"mask_blur": 8,
"tile_padding": 32,
"seam_fix_mode": "Band Pass",
"seam_fix_denoise": 1,
"seam_fix_width": 64,
"seam_fix_mask_blur": 8,
"seam_fix_padding": 16,
"force_uniform_tiles": true,
"tiled_decode": true,
"image": [
"38",
0
],
"model": [
"4",
0
],
"positive": [
"6",
0
],
"negative": [
"23",
0
],
"vae": [
"4",
2
],
"upscale_model": [
"40",
0
]
},
"class_type": "UltimateSDUpscale",
"_meta": {
"title": "Ultimate SD Upscale"
}
},
"40": {
"inputs": {
"model_name": "RealESRGAN_x2.pth"
},
"class_type": "UpscaleModelLoader",
"_meta": {
"title": "Load Upscale Model"
}
} }
} }

View file

@ -7,15 +7,19 @@ from io import BytesIO
from pydantic import BaseModel from pydantic import BaseModel
import os, re import os, re
import uuid import uuid
import aiohttp
import traceback import traceback
import requests import requests
import mimetypes import mimetypes
import shutil import shutil
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from typing import Optional, Union, Dict, List, Tuple from typing import Optional, Union, Dict, List, Tuple
from urllib.parse import urlparse from urllib.parse import urlparse
from urllib3.util.retry import Retry from urllib3.util.retry import Retry
from newspaper import Article from newspaper import Article
import trafilatura import trafilatura
from readability import Document
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
import re import re
import os import os
@ -23,10 +27,10 @@ from datetime import timedelta, datetime, time as dt_time, date as dt_date
from fastapi import HTTPException, status from fastapi import HTTPException, status
from pathlib import Path from pathlib import Path
from fastapi import APIRouter, Query, HTTPException from fastapi import APIRouter, Query, HTTPException
from sijapi import L, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, BASE_URL, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, TZ from sijapi import L, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, ARCHIVE_DIR, BASE_URL, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, TZ
from sijapi.routers import tts, llm, time, sd, locate, weather, asr, calendar from sijapi.routers import tts, llm, time, sd, locate, weather, asr, calendar
from sijapi.routers.locate import Location from sijapi.routers.locate import Location
from sijapi.utilities import assemble_journal_path, convert_to_12_hour_format, sanitize_filename, convert_degrees_to_cardinal, HOURLY_COLUMNS_MAPPING from sijapi.utilities import assemble_journal_path, assemble_archive_path, convert_to_12_hour_format, sanitize_filename, convert_degrees_to_cardinal, HOURLY_COLUMNS_MAPPING
note = APIRouter() note = APIRouter()
@ -440,9 +444,9 @@ async def parse_article(url: str, source: Optional[str] = None):
L.INFO(f"Parsed {np3k.title}") L.INFO(f"Parsed {np3k.title}")
title = np3k.title or traf.title title = (np3k.title or traf.title) or url
authors = np3k.authors or traf.author authors = np3k.authors or traf.author
authors = authors if isinstance(authors, List) else [authors] authors = (authors if isinstance(authors, List) else [authors])
date = np3k.publish_date or traf.date date = np3k.publish_date or traf.date
try: try:
date = await locate.localize_datetime(date) date = await locate.localize_datetime(date)
@ -469,6 +473,33 @@ async def parse_article(url: str, source: Optional[str] = None):
} }
async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
    """Convert an HTML document to Markdown.

    The HTML is taken from ``source`` when it is provided (and non-empty);
    otherwise it is downloaded from ``url``. The main content is extracted
    with readability, any remaining ``<script>``/``<style>`` elements are
    removed, and the result is rendered with markdownify using ATX (``#``)
    headings.

    Args:
        url: Address to download when no ``source`` is given.
        source: Raw HTML to convert; takes precedence over ``url``.

    Returns:
        The Markdown text, or ``None`` when neither argument was supplied or
        the download failed (connection error, timeout, or HTTP error status).
    """
    # Function-scope import: only needed for the timeout exception type.
    import asyncio

    if source:
        html_content = source
    elif url:
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as response:
                    # Without this check a 404/500 error page would be
                    # converted and archived as if it were the article.
                    response.raise_for_status()
                    html_content = await response.text()
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            # Report failure through the Optional return value instead of
            # leaking a raw network exception to the caller.
            L.ERR(f"Failed to fetch {url} for markdown conversion: {str(e)}")
            return None
    else:
        L.ERR("Unable to convert nothing to markdown.")
        return None

    # Use readability to extract the main content.
    doc = Document(html_content)
    cleaned_html = doc.summary()

    # Parse the cleaned HTML with BeautifulSoup for any additional processing.
    soup = BeautifulSoup(cleaned_html, 'html.parser')

    # Remove any remaining unwanted elements.
    for element in soup(['script', 'style']):
        element.decompose()

    # Convert to markdown.
    return md(str(soup), heading_style="ATX")
async def process_archive( async def process_archive(
background_tasks: BackgroundTasks, background_tasks: BackgroundTasks,
@ -476,59 +507,32 @@ async def process_archive(
title: Optional[str] = None, title: Optional[str] = None,
encoding: str = 'utf-8', encoding: str = 'utf-8',
source: Optional[str] = None, source: Optional[str] = None,
): ) -> Path:
timestamp = datetime.now().strftime('%b %d, %Y at %H:%M') timestamp = datetime.now().strftime('%b %d, %Y at %H:%M')
readable_title = title if title else f"{url} - {timestamp}"
parsed_content = await parse_article(url, source) content = await html_to_markdown(url, source)
if parsed_content is None: if content is None:
return {"error": "Failed to retrieve content"} raise HTTPException(status_code=400, detail="Failed to convert content to markdown")
content = parsed_content["content"]
readable_title = sanitize_filename(title if title else parsed_content.get("title", "Untitled")) markdown_path, relative_path = assemble_archive_path(readable_title, ".md")
if not readable_title:
readable_title = timestamp
markdown_path = OBSIDIAN_VAULT_DIR / "archive" markdown_content = f"---\n"
markdown_content += f"title: {readable_title}\n"
markdown_content += f"added: {timestamp}\n"
markdown_content += f"---\n\n"
markdown_content += f"# {readable_title}\n\n"
markdown_content += content
try: try:
frontmatter = f"""--- markdown_path.parent.mkdir(parents=True, exist_ok=True)
title: {readable_title} with open(markdown_path, 'w', encoding=encoding) as md_file:
author: {parsed_content.get('author', 'Unknown')} md_file.write(markdown_content)
published: {parsed_content.get('date_published', 'Unknown')} L.INFO(f"Successfully saved to {markdown_path}")
added: {timestamp} return markdown_path
excerpt: {parsed_content.get('excerpt', '')}
---
"""
body = f"# {readable_title}\n\n"
try:
authors = parsed_content.get('author', '')
authors_in_brackets = [f"[[{author.strip()}]]" for author in authors.split(",")]
authors_string = ", ".join(authors_in_brackets)
body += f"by {authors_string} in [{parsed_content.get('domain', urlparse(url).netloc.replace('www.', ''))}]({parsed_content.get('url', url)}).\n\n"
body += content
markdown_content = frontmatter + body
except Exception as e:
L.ERR(f"Failed to combine elements of article markdown.")
try:
with open(markdown_path, 'w', encoding=encoding) as md_file:
md_file.write(markdown_content)
L.INFO(f"Successfully saved to {markdown_path}")
add_to_daily_note
return markdown_path
except Exception as e:
L.ERR(f"Failed to write markdown file")
raise HTTPException(status_code=500, detail=str(e))
except Exception as e: except Exception as e:
L.ERR(f"Failed to clip {url}: {str(e)}") L.ERR(f"Failed to write markdown file: {str(e)}")
raise HTTPException(status_code=500, detail=str(e)) raise HTTPException(status_code=500, detail=f"Failed to write markdown file: {str(e)}")
def download_file(url, folder): def download_file(url, folder):
os.makedirs(folder, exist_ok=True) os.makedirs(folder, exist_ok=True)
@ -569,7 +573,6 @@ def copy_file(local_path, folder):
return filename return filename
async def save_file(file: UploadFile, folder: Path) -> Path: async def save_file(file: UploadFile, folder: Path) -> Path:
file_path = folder / f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{file.filename}" file_path = folder / f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{file.filename}"
with open(file_path, 'wb') as f: with open(file_path, 'wb') as f:

View file

@ -25,7 +25,7 @@ import asyncpg
from sshtunnel import SSHTunnelForwarder from sshtunnel import SSHTunnelForwarder
from fastapi import Depends, HTTPException, Request, UploadFile from fastapi import Depends, HTTPException, Request, UploadFile
from fastapi.security.api_key import APIKeyHeader from fastapi.security.api_key import APIKeyHeader
from sijapi import L, GLOBAL_API_KEY, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_FILENAME_LENGTH from sijapi import L, GLOBAL_API_KEY, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_PATH_LENGTH, ARCHIVE_DIR
api_key_header = APIKeyHeader(name="Authorization") api_key_header = APIKeyHeader(name="Authorization")
@ -38,6 +38,35 @@ def validate_api_key(request: Request, api_key: str = Depends(api_key_header)):
raise HTTPException(status_code=401, detail="Invalid or missing API key") raise HTTPException(status_code=401, detail="Invalid or missing API key")
def assemble_archive_path(filename: str, extension: str = ".md", date_time: datetime = None, subdir: str = None) -> Tuple[Path, Path]:
    """Build the dated archive path for a file.

    The resulting layout is
    ``ARCHIVE_DIR / <year> / <month> / <day> / "<day_short> <HHMMSS> <name><ext>"``,
    where the date parts are formatted with YEAR_FMT, MONTH_FMT, DAY_FMT and
    DAY_SHORT_FMT.

    Args:
        filename: Desired file name. If it already carries an extension, that
            extension wins over ``extension``.
        extension: Extension to use when ``filename`` has none.
        date_time: Moment used for the dated directories and the timestamp.
            Defaults to the time of the call.
        subdir: Accepted for interface compatibility; currently unused.

    Returns:
        ``(absolute_path, relative_path)``, where ``relative_path`` is relative
        to ARCHIVE_DIR. No directories are created here.
    """
    # Resolve the default per call: a `datetime.now()` default in the signature
    # is evaluated once at import time and would stamp every archive entry
    # with the moment the process started.
    if date_time is None:
        date_time = datetime.now()

    year = date_time.strftime(YEAR_FMT)
    month = date_time.strftime(MONTH_FMT)
    day = date_time.strftime(DAY_FMT)
    day_short = date_time.strftime(DAY_SHORT_FMT)
    timestamp = date_time.strftime("%H%M%S")

    # Ensure the extension is preserved.
    # NOTE(review): os.path.splitext treats everything after the last dot of
    # the final path component as the extension, so a title containing a dot
    # may yield a long, unsanitized "extension" -- confirm against callers.
    base_name, ext = os.path.splitext(filename)
    extension = ext if ext else extension

    # Only the base name is sanitized; the extension is appended untouched.
    sanitized_base = sanitize_filename(base_name, '')

    def _paths_for(base: str) -> Tuple[Path, Path]:
        # Assemble (absolute, relative) paths for a given base name.
        relative = Path(year) / month / day / f"{day_short} {timestamp} {base}{extension}"
        return ARCHIVE_DIR / relative, relative

    absolute_path, relative_path = _paths_for(sanitized_base)

    # Keep the total path within MAX_PATH_LENGTH by trimming the base name
    # only. The excess is removed in one slice; if the fixed parts alone are
    # already too long, the base becomes empty and we stop, rather than
    # looping forever on an empty string.
    overflow = len(str(absolute_path)) - MAX_PATH_LENGTH
    if overflow > 0:
        keep = max(0, len(sanitized_base) - overflow)
        absolute_path, relative_path = _paths_for(sanitized_base[:keep])

    return absolute_path, relative_path
def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str = None, extension: str = None, no_timestamp: bool = False) -> Tuple[Path, Path]: def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str = None, extension: str = None, no_timestamp: bool = False) -> Tuple[Path, Path]:
''' '''
Obsidian helper. Takes a datetime and optional subdirectory name, filename, and extension. Obsidian helper. Takes a datetime and optional subdirectory name, filename, and extension.
@ -51,32 +80,22 @@ def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str
timestamp = date_time.strftime("%H%M%S") timestamp = date_time.strftime("%H%M%S")
relative_path = Path("journal") / year / month / day relative_path = Path("journal") / year / month / day
if not subdir and not filename and not extension: if not subdir and not filename and not extension:
# standard daily note handler, where only the date_time was specified:
relative_path = relative_path / f"{day}.md" relative_path = relative_path / f"{day}.md"
else: else:
if subdir: if subdir:
# datestamped subdirectory handler
relative_path = relative_path / f"{day_short} {subdir}" relative_path = relative_path / f"{day_short} {subdir}"
if filename: if filename:
filename = sanitize_filename(filename)
filename = f"{day_short} {filename}" if no_timestamp else f"{day_short} {timestamp} {filename}"
if extension: if extension:
extension = extension if extension.startswith(".") else f".{extension}" extension = extension if extension.startswith(".") else f".{extension}"
filename = f"{filename}{extension}" if not filename.endswith(extension) else filename
else: else:
if has_valid_extension(filename, [".md", ".m4a", ".wav", ".aiff", ".flac", ".mp3", ".mp4", ".pdf", ".js", ".json", ".yaml", ".py"]): extension = validate_extension(filename, [".md", ".m4a", ".wav", ".aiff", ".flac", ".mp3", ".mp4", ".pdf", ".js", ".json", ".yaml", ".py"]) or ".md"
L.DEBUG(f"Provided filename has a valid extension, so we use that.")
else:
filename = f"{filename}.md"
L.DEBUG(f"We are forcing the file to be a .md")
filename = sanitize_filename(filename)
filename = f"{day_short} {filename}" if no_timestamp else f"{day_short} {timestamp} {filename}"
filename = f"{filename}{extension}" if not filename.endswith(extension) else filename
relative_path = relative_path / filename relative_path = relative_path / filename
else: else:
@ -84,20 +103,16 @@ def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str
return None, None return None, None
absolute_path = OBSIDIAN_VAULT_DIR / relative_path absolute_path = OBSIDIAN_VAULT_DIR / relative_path
os.makedirs(absolute_path.parent, exist_ok=True) os.makedirs(absolute_path.parent, exist_ok=True)
return absolute_path, relative_path return absolute_path, relative_path
def has_valid_extension(filename, valid_extensions=None): def validate_extension(filename, valid_extensions=None):
if valid_extensions is None: if valid_extensions is None:
# Check if there's any extension return os.path.splitext(filename)
return bool(os.path.splitext(filename)[1])
else: else:
# Check if the extension is in the list of valid extensions extension = os.path.splitext(filename)[-1].lower()
return os.path.splitext(filename)[1].lower() in valid_extensions return extension if extension in valid_extensions else None
def prefix_lines(text: str, prefix: str = '> ') -> str: def prefix_lines(text: str, prefix: str = '> ') -> str:
lines = text.split('\n') lines = text.split('\n')
@ -138,7 +153,7 @@ def get_extension(file):
def sanitize_filename(text, max_length=MAX_FILENAME_LENGTH): def sanitize_filename(text, extension: str = None, max_length: int = MAX_PATH_LENGTH):
"""Sanitize a string to be used as a safe filename while protecting the file extension.""" """Sanitize a string to be used as a safe filename while protecting the file extension."""
L.DEBUG(f"Filename before sanitization: {text}") L.DEBUG(f"Filename before sanitization: {text}")
@ -149,7 +164,7 @@ def sanitize_filename(text, max_length=MAX_FILENAME_LENGTH):
max_base_length = max_length - len(extension) max_base_length = max_length - len(extension)
if len(base_name) > max_base_length: if len(base_name) > max_base_length:
base_name = base_name[:max_base_length].rstrip() base_name = base_name[:max_base_length - 5].rstrip()
final_filename = base_name + extension final_filename = base_name + extension
L.DEBUG(f"Filename after sanitization: {final_filename}") L.DEBUG(f"Filename after sanitization: {final_filename}")