Auto-update: Sun Jun 30 11:08:14 PDT 2024

This commit is contained in:
sanj 2024-06-30 11:08:14 -07:00
parent c9dc619a5a
commit c742336b62
5 changed files with 369 additions and 114 deletions

View file

@ -23,21 +23,16 @@ os.makedirs(LOGS_DIR, exist_ok=True)
load_dotenv(ENV_PATH)
### API essentials
API_CONFIG_PATH = CONFIG_DIR / "api.yaml"
SECRETS_PATH = CONFIG_DIR / "secrets.yaml"
API = APIConfig.load(API_CONFIG_PATH, SECRETS_PATH)
DIR_CONFIG_PATH = CONFIG_DIR / "dirs.yaml"
L.DEBUG(f"Loading DIR configuration from: {DIR_CONFIG_PATH}")
DIR = Configuration.load(DIR_CONFIG_PATH)
L.DEBUG(f"Loaded DIR configuration: {DIR.__dict__}")
DB = Database.from_env()
API = APIConfig.load('api', 'secrets')
Dir = Configuration.load('dirs')
HOST = f"{API.BIND}:{API.PORT}"
LOCAL_HOSTS = [ipaddress.ip_address(localhost.strip()) for localhost in os.getenv('LOCAL_HOSTS', '127.0.0.1').split(',')] + ['localhost']
SUBNET_BROADCAST = os.getenv("SUBNET_BROADCAST", '10.255.255.255')
MAX_CPU_CORES = min(int(os.getenv("MAX_CPU_CORES", int(multiprocessing.cpu_count()/2))), multiprocessing.cpu_count())
DB = Database.from_env()
News = Configuration.load('news', 'secrets')
SD = Configuration.load('sd', 'secrets')
### Directories & general paths
ROUTER_DIR = BASE_DIR / "routers"
@ -66,7 +61,7 @@ GEO = Geocoder(NAMED_LOCATIONS, TZ_CACHE)
### Obsidian & notes
ALLOWED_FILENAME_CHARS = r'[^\w \.-]'
MAX_PATH_LENGTH = 254
OBSIDIAN_VAULT_DIR = Path(os.getenv("OBSIDIAN_BASE_DIR") or Path(DIR.HOME) / "Nextcloud" / "notes")
OBSIDIAN_VAULT_DIR = Path(os.getenv("OBSIDIAN_BASE_DIR") or Path(Dir.HOME) / "Nextcloud" / "notes")
OBSIDIAN_JOURNAL_DIR = OBSIDIAN_VAULT_DIR / "journal"
OBSIDIAN_RESOURCES_DIR = "obsidian/resources"
OBSIDIAN_BANNER_DIR = f"{OBSIDIAN_RESOURCES_DIR}/banners"
@ -118,7 +113,7 @@ SD_CONFIG_PATH = CONFIG_DIR / 'sd.yaml'
### ASR
ASR_DIR = DATA_DIR / "asr"
os.makedirs(ASR_DIR, exist_ok=True)
WHISPER_CPP_DIR = Path(DIR.HOME) / str(os.getenv("WHISPER_CPP_DIR"))
WHISPER_CPP_DIR = Path(Dir.HOME) / str(os.getenv("WHISPER_CPP_DIR"))
WHISPER_CPP_MODELS = os.getenv('WHISPER_CPP_MODELS', 'NULL,VOID').split(',')
### TTS
@ -135,6 +130,7 @@ TTS_SEGMENTS_DIR = TTS_DIR / 'segments'
os.makedirs(TTS_SEGMENTS_DIR, exist_ok=True)
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
### Calendar & email account
MS365_TOGGLE = True if os.getenv("MS365_TOGGLE") == "True" else False
ICAL_TOGGLE = True if os.getenv("ICAL_TOGGLE") == "True" else False

View file

@ -22,6 +22,119 @@ from timezonefinder import TimezoneFinder
T = TypeVar('T', bound='Configuration')
import os
from pathlib import Path
from typing import Union, Optional, Any, Dict, List
import yaml
import re
from pydantic import BaseModel, create_model
from dotenv import load_dotenv
class Configuration(BaseModel):
    """Dynamic configuration loaded from YAML with ``{{ placeholder }}`` resolution.

    Placeholders supported inside string values:
      - ``{{ NAME }}``       -> attribute lookup on the directory config
      - ``{{ Dir.NAME }}``   -> attribute lookup on the directory config
      - ``{{ ENV.NAME }}``   -> environment variable (empty string default)
    """
    # Base directory fallback; individual YAML files may override HOME.
    HOME: Path = Path.home()
    # Configuration consulted for placeholder lookups; defaults to the
    # instance itself when none is supplied to load().
    _dir_config: Optional['Configuration'] = None

    @classmethod
    def load(cls, yaml_path: Union[str, Path], secrets_path: Optional[Union[str, Path]] = None, dir_config: Optional['Configuration'] = None) -> 'Configuration':
        """Load a YAML config (optionally merged with a secrets YAML) into a
        dynamically-typed Configuration with placeholders resolved.

        Re-raises any file/YAML error after logging it.
        """
        yaml_path = cls._resolve_path(yaml_path, 'config')
        if secrets_path:
            secrets_path = cls._resolve_path(secrets_path, 'config')
        try:
            with yaml_path.open('r') as file:
                # An empty YAML file makes safe_load return None; normalize to {}
                # so .update()/.get() below cannot crash.
                config_data = yaml.safe_load(file) or {}
            print(f"Loaded configuration data from {yaml_path}")
            if secrets_path:
                with secrets_path.open('r') as file:
                    secrets_data = yaml.safe_load(file) or {}
                print(f"Loaded secrets data from {secrets_path}")
                config_data.update(secrets_data)
            # Ensure HOME is set
            if config_data.get('HOME') is None:
                config_data['HOME'] = str(Path.home())
                print(f"HOME was None in config, set to default: {config_data['HOME']}")
            load_dotenv()
            # First pass builds a model so placeholder resolution can use
            # attribute lookups; second pass rebuilds from the resolved data.
            instance = cls.create_dynamic_model(**config_data)
            instance._dir_config = dir_config or instance
            resolved_data = instance.resolve_placeholders(config_data)
            instance = cls.create_dynamic_model(**resolved_data)
            instance._dir_config = dir_config or instance
            return instance
        except Exception as e:
            print(f"Error loading configuration: {str(e)}")
            raise

    @classmethod
    def _resolve_path(cls, path: Union[str, Path], default_dir: str) -> Path:
        """Map a bare name to <repo>/sijapi/<default_dir>/<name>.yaml and anchor
        relative paths at the repository root; absolute paths pass through."""
        base_path = Path(__file__).parent.parent  # This will be two levels up from this file
        path = Path(path)
        if not path.suffix:
            path = base_path / 'sijapi' / default_dir / f"{path.name}.yaml"
        elif not path.is_absolute():
            path = base_path / path
        return path

    def resolve_placeholders(self, data: Any) -> Any:
        """Recursively resolve placeholders inside dicts, lists, and strings."""
        if isinstance(data, dict):
            return {k: self.resolve_placeholders(v) for k, v in data.items()}
        elif isinstance(data, list):
            return [self.resolve_placeholders(v) for v in data]
        elif isinstance(data, str):
            return self.resolve_string_placeholders(data)
        else:
            return data

    def resolve_string_placeholders(self, value: str) -> Any:
        """Substitute every ``{{ ... }}`` token in *value*.

        Uses re.sub with a replacement callback so placeholders written with
        surrounding whitespace ("{{ HOME }}") are substituted correctly; the
        previous str.replace reconstruction dropped the whitespace and never
        matched such tokens.
        """
        pattern = r'\{\{\s*([^}]+?)\s*\}\}'

        def _substitute(m: 're.Match') -> str:
            parts = m.group(1).split('.')
            if len(parts) == 1:  # Internal reference
                return str(getattr(self._dir_config, parts[0], str(Path.home() / parts[0].lower())))
            if len(parts) == 2 and parts[0] == 'Dir':
                return str(getattr(self._dir_config, parts[1], str(Path.home() / parts[1].lower())))
            if len(parts) == 2 and parts[0] == 'ENV':
                return os.getenv(parts[1], '')
            return m.group(0)  # Keep original if not recognized

        value = re.sub(pattern, _substitute, value)
        # Convert to Path if it looks like a file path: POSIX absolute/home,
        # or a Windows drive spec such as "C:\...". The len() guard prevents
        # an IndexError on one-character strings containing ':'.
        if isinstance(value, str) and (value.startswith(('/', '~')) or (len(value) > 1 and value[1] == ':')):
            return Path(value).expanduser()
        return value

    @classmethod
    def create_dynamic_model(cls, **data):
        """Build a pydantic model subclass whose fields mirror *data*; nested
        dicts (and lists of dicts) become nested dynamic models."""
        for key, value in data.items():
            if isinstance(value, dict):
                data[key] = cls.create_dynamic_model(**value)
            elif isinstance(value, list) and all(isinstance(item, dict) for item in value):
                data[key] = [cls.create_dynamic_model(**item) for item in value]
        DynamicModel = create_model(
            f'Dynamic{cls.__name__}',
            __base__=cls,
            **{k: (Any, v) for k, v in data.items()}
        )
        return DynamicModel(**data)

    class Config:
        extra = "allow"
        arbitrary_types_allowed = True
class APIConfig(BaseModel):
HOST: str
PORT: int
@ -34,7 +147,10 @@ class APIConfig(BaseModel):
KEYS: List[str]
@classmethod
def load(cls, config_path: Path, secrets_path: Path):
def load(cls, config_path: Union[str, Path], secrets_path: Union[str, Path]):
config_path = cls._resolve_path(config_path, 'config')
secrets_path = cls._resolve_path(secrets_path, 'config')
# Load main configuration
with open(config_path, 'r') as file:
config_data = yaml.safe_load(file)
@ -90,6 +206,16 @@ class APIConfig(BaseModel):
return cls(**config_data)
@classmethod
def _resolve_path(cls, path: Union[str, Path], default_dir: str) -> Path:
    """Normalize *path* to an absolute Path.

    A bare name (no suffix) maps to <repo>/sijapi/<default_dir>/<name>.yaml;
    a relative path is anchored at the repository root; an absolute path
    with a suffix is returned unchanged.
    """
    repo_root = Path(__file__).parent.parent  # two levels up from this file
    candidate = Path(path)
    if not candidate.suffix:
        return repo_root / "sijapi" / default_dir / f"{candidate.name}.yaml"
    if not candidate.is_absolute():
        return repo_root / candidate
    return candidate
@classmethod
def resolve_placeholders(cls, config_data: Dict[str, Any]) -> Dict[str, Any]:
def resolve_value(value):
@ -127,87 +253,6 @@ class APIConfig(BaseModel):
return [module for module, is_active in self.MODULES.__dict__.items() if is_active]
class Configuration(BaseModel):
    # NOTE(review): this is the pre-refactor Configuration shown as removed by
    # the diff; the newer version earlier in the module adds secrets support
    # and path resolution. Kept byte-comparable for reference.
    HOME: Path = Path.home()
    # Configuration consulted for {{ DIR.* }} placeholder lookups; defaults to self.
    _dir_config: Optional['Configuration'] = None

    @classmethod
    def load(cls, yaml_path: Union[str, Path], dir_config: Optional['Configuration'] = None) -> 'Configuration':
        """Load a YAML file into a dynamically-typed Configuration and resolve
        its {{ ... }} placeholders onto the instance. Re-raises load errors."""
        yaml_path = Path(yaml_path)
        try:
            with yaml_path.open('r') as file:
                config_data = yaml.safe_load(file)
            print(f"Loaded configuration data: {config_data}")
            # Ensure HOME is set
            if config_data.get('HOME') is None:
                config_data['HOME'] = str(Path.home())
                print(f"HOME was None in config, set to default: {config_data['HOME']}")
            load_dotenv()
            instance = cls.create_dynamic_model(**config_data)
            instance._dir_config = dir_config or instance
            resolved_data = instance.resolve_placeholders(config_data)
            # Write resolved values back onto the same instance rather than
            # rebuilding the dynamic model from scratch.
            for key, value in resolved_data.items():
                setattr(instance, key, value)
            return instance
        except Exception as e:
            print(f"Error loading configuration from {yaml_path}: {str(e)}")
            raise

    def resolve_placeholders(self, data: Any) -> Any:
        """Recursively resolve placeholders inside dicts, lists, and strings."""
        if isinstance(data, dict):
            return {k: self.resolve_placeholders(v) for k, v in data.items()}
        elif isinstance(data, list):
            return [self.resolve_placeholders(v) for v in data]
        elif isinstance(data, str):
            return self.resolve_string_placeholders(data)
        else:
            return data

    def resolve_string_placeholders(self, value: str) -> Any:
        """Substitute {{ NAME }}, {{ DIR.NAME }}, and {{ ENV.NAME }} tokens."""
        pattern = r'\{\{\s*([^}]+)\s*\}\}'
        matches = re.findall(pattern, value)
        for match in matches:
            parts = match.split('.')
            if len(parts) == 1:  # Internal reference
                replacement = getattr(self._dir_config, parts[0], str(Path.home() / parts[0].lower()))
            elif len(parts) == 2 and parts[0] == 'DIR':
                replacement = getattr(self._dir_config, parts[1], str(Path.home() / parts[1].lower()))
            elif len(parts) == 2 and parts[0] == 'ENV':
                replacement = os.getenv(parts[1], '')
            else:
                replacement = value  # Keep original if not recognized
            # NOTE(review): the greedy capture keeps inner whitespace, so
            # "{{ HOME }}" yields match == "HOME " and the reconstructed
            # "{{HOME }}" never matches the original text — placeholders
            # written with spaces are silently left unresolved.
            value = value.replace('{{' + match + '}}', str(replacement))
        # Convert to Path if it looks like a file path
        # NOTE(review): value[1] raises IndexError for a one-character string
        # containing ':' — confirm inputs, or guard with len(value) > 1.
        if isinstance(value, str) and (value.startswith(('/', '~')) or (':' in value and value[1] == ':')):
            return Path(value).expanduser()
        return value

    @classmethod
    def create_dynamic_model(cls, **data):
        """Build a pydantic model subclass whose fields mirror *data*; nested
        dicts become nested dynamic models."""
        for key, value in data.items():
            if isinstance(value, dict):
                data[key] = cls.create_dynamic_model(**value)
        # NOTE(review): each field is typed as type(v), so a None value types
        # the field as NoneType — confirm upstream data never carries None
        # (the HOME guard in load() only covers that one key).
        DynamicModel = create_model(
            f'Dynamic{cls.__name__}',
            __base__=cls,
            **{k: (type(v), v) for k, v in data.items()}
        )
        return DynamicModel(**data)

    class Config:
        # Allow extra keys from YAML and non-pydantic types (e.g. Path).
        extra = "allow"
        arbitrary_types_allowed = True
class Location(BaseModel):
latitude: float

View file

@ -0,0 +1,32 @@
# News-scraper configuration: per-site scrape settings plus shared LLM/TTS
# defaults consumed by the news router (News = Configuration.load('news', ...)).
sites:
  - name: The Intercept
    url: https://theintercept.com
    max_articles: 5       # cap on articles fetched per refresh
    days_back: 14         # skip articles older than this many days
    summarize: True
    # NOTE(review): bare "off" parses as YAML boolean false, not the string
    # "off" the router compares against — confirm intent; quote as "off" if a
    # string is expected.
    tts: off
    tts_voice: Kiel
    podcast: True
  - name: The New York Times
    url: https://www.nytimes.com
    max_articles: 10
    days_back: 7
    summarize: True
    tts: off
    tts_voice: Luna
    podcast: True
  - name: The Guardian
    url: https://theguardian.com
    max_articles: 10
    days_back: 7
    summarize: True
    tts: off
    tts_voice: Attenborough
    podcast: True
# Shared defaults for summarization and speech synthesis.
llm:
  model: llama3
tts:
  model: elevenlabs-v2
  voice: Luna
  podcast: True

View file

@ -1,32 +1,214 @@
from fastapi import APIRouter, BackgroundTasks, File, UploadFile, Form, HTTPException, Response, Query, Path as FastAPIPath
from fastapi.responses import JSONResponse
from zoneinfo import ZoneInfo
from io import BytesIO
from pydantic import BaseModel
from bs4 import BeautifulSoup
import requests
from markdownify import markdownify as md
import os
import mimetypes
from datetime import datetime as dt_datetime
import shutil
import uuid
import aiohttp
from pathlib import Path
import asyncio
import shutil
import requests
import mimetypes
from io import BytesIO
from bs4 import BeautifulSoup
from zoneinfo import ZoneInfo
from urllib.parse import urlparse
from urllib3.util.retry import Retry
from datetime import datetime as dt_datetime, timedelta
from typing import Optional
import aiohttp
import aiofiles
import newspaper
from newspaper import Article
import trafilatura
from readability import Document
from markdownify import markdownify as md
from requests.adapters import HTTPAdapter
from sijapi import API, L, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, GEO
from urllib3.util.retry import Retry
from fastapi import APIRouter, BackgroundTasks, File, UploadFile, Form, HTTPException, Response, Query, Path as FastAPIPath
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from pathlib import Path
from sijapi.classes import Configuration
from sijapi import API, L, Dir, News, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, GEO
from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path
from sijapi.routers import llm, tts, asr, loc
from newspaper import Article
news = APIRouter()
async def download_and_save_article(article, site_name, earliest_date, bg_tasks: BackgroundTasks, tts_mode: str = "summary", voice: str = DEFAULT_11L_VOICE):
    """Fetch one article, enrich it with trafilatura metadata, and save it as an
    Obsidian markdown note (optionally with a TTS audio embed).

    Returns True on success, False when the article is out of date range or any
    step fails (all exceptions are caught and logged).
    """
    try:
        url = article.url
        source = trafilatura.fetch_url(url)
        if source is None:
            # Fallback to newspaper3k if trafilatura fails
            article.download()
            article.parse()
            traf = None
        else:
            traf = trafilatura.extract_metadata(filecontent=source, default_url=url)
            article.download()
            article.parse()
        # Update article properties, preferring trafilatura data when available
        article.title = traf.title if traf and traf.title else article.title or url
        article.authors = traf.author if traf and traf.author else article.authors or []
        article.publish_date = traf.date if traf and traf.date else article.publish_date
        try:
            article.publish_date = await loc.dt(article.publish_date, "UTC")
        # NOTE(review): bare except also swallows CancelledError/KeyboardInterrupt;
        # consider narrowing to Exception.
        except:
            L.DEBUG(f"Failed to localize {article.publish_date}")
            article.publish_date = await loc.dt(dt_datetime.now(), "UTC")
        article.meta_description = traf.description if traf and traf.description else article.meta_description
        article.text = trafilatura.extract(source, output_format="markdown", include_comments=False) if source else article.text
        article.top_image = traf.image if traf and traf.image else article.top_image
        article.source_url = traf.sitename if traf and traf.sitename else urlparse(url).netloc.replace('www.', '').title()
        article.meta_keywords = traf.categories or traf.tags if traf else article.meta_keywords or []
        # Normalize keywords to a list so the frontmatter join below is safe.
        article.meta_keywords = article.meta_keywords if isinstance(article.meta_keywords, list) else [article.meta_keywords]
        if not is_article_within_date_range(article, earliest_date):
            return False
        timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
        readable_title = sanitize_filename(article.title or timestamp)
        markdown_filename, relative_path = assemble_journal_path(dt_datetime.now(), subdir="Articles", filename=readable_title, extension=".md")
        summary = await llm.summarize_text(article.text, "Summarize the provided text. Respond with the summary and nothing else. Do not otherwise acknowledge the request. Just provide the requested summary.")
        summary = summary.replace('\n', ' ')  # Remove line breaks
        # Choose what (if anything) gets spoken: full text, the summary, or nothing.
        if tts_mode == "full" or tts_mode == "content":
            tts_text = article.text
        elif tts_mode == "summary" or tts_mode == "excerpt":
            tts_text = summary
        else:
            tts_text = None
        banner_markdown = ''
        try:
            banner_url = article.top_image
            if banner_url:
                # NOTE(review): download_file is not visible in this diff —
                # presumably defined elsewhere in this module; verify.
                banner_image = download_file(banner_url, Path(OBSIDIAN_VAULT_DIR / OBSIDIAN_RESOURCES_DIR))
                if banner_image:
                    banner_markdown = f"![[{OBSIDIAN_RESOURCES_DIR}/{banner_image}]]"
        except Exception as e:
            L.ERR(f"No image found in article")
        # Render authors as Obsidian wiki-links, with a placeholder when empty.
        authors = ', '.join(['[[{}]]'.format(author.strip()) for author in article.authors if author.strip()])
        if not authors:
            authors = '[[Unknown Author]]'
        frontmatter = f"""---
title: {readable_title}
authors: {authors}
published: {article.publish_date}
added: {timestamp}
banner: "{banner_markdown}"
tags:
"""
        frontmatter += '\n'.join(f" - {tag}" for tag in article.meta_keywords)
        frontmatter += '\n---\n'
        body = f"# {readable_title}\n\n"
        if tts_text:
            audio_filename = f"{article.publish_date.strftime('%Y-%m-%d')} {readable_title}"
            try:
                audio_path = await tts.generate_speech(
                    bg_tasks=bg_tasks,
                    text=tts_text,
                    voice=voice,
                    model="eleven_turbo_v2",
                    podcast=True,
                    title=audio_filename,
                    output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR
                )
                if isinstance(audio_path, Path):
                    audio_ext = audio_path.suffix
                    obsidian_link = f"![[{audio_path.name}]]"
                    body += f"{obsidian_link}\n\n"
                else:
                    L.WARN(f"Unexpected audio_path type: {type(audio_path)}. Value: {audio_path}")
            except Exception as e:
                # TTS failure is non-fatal: the note is still written without audio.
                L.ERR(f"Failed to generate TTS for {audio_filename}. Error: {str(e)}")
                L.ERR(f"TTS error details - voice: {voice}, model: eleven_turbo_v2, podcast: True")
                L.ERR(f"Output directory: {Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR}")
        body += f"by {authors} in {article.source_url}\n\n"
        body += f"> [!summary]+\n"
        body += f"> {summary}\n\n"
        body += article.text
        markdown_content = frontmatter + body
        with open(markdown_filename, 'w') as md_file:
            md_file.write(markdown_content)
        L.INFO(f"Successfully saved to {markdown_filename}")
        # NOTE(review): add_to_daily_note is not visible in this diff —
        # presumably defined elsewhere in this module; verify.
        add_to_daily_note(relative_path)
        print(f"Saved article: {relative_path}")
        return True
    except Exception as e:
        L.ERR(f"Error processing article from {article.url}: {str(e)}")
        return False
def is_article_within_date_range(article, earliest_date):
    """Return True when the article has a publish date on or after *earliest_date*."""
    published = article.publish_date
    if published is None:
        return False
    return published.date() >= earliest_date
async def process_news_site(site, bg_tasks: BackgroundTasks):
    """Download and save up to site.max_articles recent articles from *site*.

    Articles older than site.days_back days are filtered out per-article.
    Any exception is logged and swallowed so one failing site does not abort
    the whole refresh run.
    """
    print(f"Downloading articles from {site.name}...")
    earliest_date = dt_datetime.now().date() - timedelta(days=site.days_back)
    try:
        news_source = newspaper.build(site.url, memoize_articles=False)
        tasks = []
        for article in news_source.articles[:site.max_articles]:
            task = asyncio.create_task(download_and_save_article(
                article,
                site.name,
                earliest_date,
                bg_tasks,
                tts_mode=getattr(site, 'tts', "off"),
                # BUG FIX: the voice comes from site.tts_voice (see news.yaml);
                # previously site.tts — the TTS *mode*, e.g. "off" — was passed
                # as the voice argument.
                voice=getattr(site, 'tts_voice', DEFAULT_11L_VOICE)
            ))
            tasks.append(task)
        # download_and_save_article returns True/False, so summing counts successes.
        results = await asyncio.gather(*tasks)
        articles_downloaded = sum(results)
        print(f"Downloaded {articles_downloaded} articles from {site.name}")
    except Exception as e:
        print(f"Error processing {site.name}: {str(e)}")
# Refresh endpoint: fans out one task per configured site and waits for all.
@news.get("/news/refresh")
async def news_refresh_endpoint(bg_tasks: BackgroundTasks):
    """Trigger a concurrent scrape of every site listed in News.sites."""
    tasks = [process_news_site(site, bg_tasks) for site in News.sites]
    await asyncio.gather(*tasks)
    return "OK"
async def generate_path(article, site_name):
    """Return (absolute_path, relative_path) for an article's markdown note
    under the Articles journal subdirectory, dated by its publish date."""
    raw_date = article.publish_date if article.publish_date else dt_datetime.now()
    publish_date = await loc.dt(raw_date, 'UTC')
    # Replace every non-alphanumeric character with '_' and cap the slug length.
    title_slug = "".join(ch if ch.isalnum() else "_" for ch in article.title)
    filename = f"{site_name} - {title_slug[:50]}.md"
    return assemble_journal_path(publish_date, 'Articles', filename, extension='.md', no_timestamp=True)
async def save_article_to_file(content, output_path):
    """Asynchronously write *content* (UTF-8 text) to *output_path*, creating
    any missing parent directories first."""
    output_path.parent.mkdir(parents=True, exist_ok=True)
    async with aiofiles.open(output_path, 'w', encoding='utf-8') as file:
        await file.write(content)
### CLIPPER ###
@news.post("/clip")
async def clip_post(

View file

@ -143,7 +143,7 @@ async def generate_speech(
# raise HTTPException(status_code=400, detail="Invalid model specified")
if podcast == True:
podcast_path = PODCAST_DIR / audio_file_path.name
podcast_path = Path(PODCAST_DIR) / audio_file_path.name
L.DEBUG(f"Podcast path: {podcast_path}")
shutil.copy(str(audio_file_path), str(podcast_path))
bg_tasks.add_task(os.remove, str(audio_file_path))
@ -152,7 +152,7 @@ async def generate_speech(
return str(audio_file_path)
except Exception as e:
L.ERROR(f"Failed to generate speech: {str(e)}")
L.ERR(f"Failed to generate speech: {str(e)}")
raise HTTPException(status_code=500, detail=f"Failed to generate speech: {str(e)}")
@ -331,7 +331,7 @@ async def local_tts(
# Export the combined audio in a separate thread
if podcast:
podcast_file_path = PODCAST_DIR / file_path.name
podcast_file_path = Path(PODCAST_DIR) / file_path.name
await asyncio.to_thread(combined_audio.export, podcast_file_path, format="wav")
await asyncio.to_thread(combined_audio.export, file_path, format="wav")
@ -425,7 +425,7 @@ def copy_to_podcast_dir(file_path):
file_name = Path(file_path).name
# Construct the destination path in the PODCAST_DIR
destination_path = PODCAST_DIR / file_name
destination_path = Path(PODCAST_DIR) / file_name
# Copy the file to the PODCAST_DIR
shutil.copy(file_path, destination_path)