From c742336b62672da5bd3faff9df70c1d5486c8883 Mon Sep 17 00:00:00 2001 From: sanj <67624670+iodrift@users.noreply.github.com> Date: Sun, 30 Jun 2024 11:08:14 -0700 Subject: [PATCH] Auto-update: Sun Jun 30 11:08:14 PDT 2024 --- sijapi/__init__.py | 20 ++- sijapi/classes.py | 209 +++++++++++++++++++------------ sijapi/config/news.yaml-example | 32 +++++ sijapi/routers/news.py | 214 +++++++++++++++++++++++++++++--- sijapi/routers/tts.py | 8 +- 5 files changed, 369 insertions(+), 114 deletions(-) create mode 100644 sijapi/config/news.yaml-example diff --git a/sijapi/__init__.py b/sijapi/__init__.py index 3ddc5a4..6d5e91e 100644 --- a/sijapi/__init__.py +++ b/sijapi/__init__.py @@ -23,21 +23,16 @@ os.makedirs(LOGS_DIR, exist_ok=True) load_dotenv(ENV_PATH) ### API essentials -API_CONFIG_PATH = CONFIG_DIR / "api.yaml" -SECRETS_PATH = CONFIG_DIR / "secrets.yaml" -API = APIConfig.load(API_CONFIG_PATH, SECRETS_PATH) -DIR_CONFIG_PATH = CONFIG_DIR / "dirs.yaml" -L.DEBUG(f"Loading DIR configuration from: {DIR_CONFIG_PATH}") -DIR = Configuration.load(DIR_CONFIG_PATH) -L.DEBUG(f"Loaded DIR configuration: {DIR.__dict__}") - -DB = Database.from_env() - +API = APIConfig.load('api', 'secrets') +Dir = Configuration.load('dirs') HOST = f"{API.BIND}:{API.PORT}" LOCAL_HOSTS = [ipaddress.ip_address(localhost.strip()) for localhost in os.getenv('LOCAL_HOSTS', '127.0.0.1').split(',')] + ['localhost'] SUBNET_BROADCAST = os.getenv("SUBNET_BROADCAST", '10.255.255.255') MAX_CPU_CORES = min(int(os.getenv("MAX_CPU_CORES", int(multiprocessing.cpu_count()/2))), multiprocessing.cpu_count()) +DB = Database.from_env() +News = Configuration.load('news', 'secrets') +SD = Configuration.load('sd', 'secrets') ### Directories & general paths ROUTER_DIR = BASE_DIR / "routers" @@ -66,7 +61,7 @@ GEO = Geocoder(NAMED_LOCATIONS, TZ_CACHE) ### Obsidian & notes ALLOWED_FILENAME_CHARS = r'[^\w \.-]' MAX_PATH_LENGTH = 254 -OBSIDIAN_VAULT_DIR = Path(os.getenv("OBSIDIAN_BASE_DIR") or Path(DIR.HOME) / "Nextcloud" / "notes") +OBSIDIAN_VAULT_DIR = Path(os.getenv("OBSIDIAN_BASE_DIR") or Path(Dir.HOME) / "Nextcloud" / "notes") OBSIDIAN_JOURNAL_DIR = OBSIDIAN_VAULT_DIR / "journal" OBSIDIAN_RESOURCES_DIR = "obsidian/resources" OBSIDIAN_BANNER_DIR = f"{OBSIDIAN_RESOURCES_DIR}/banners" @@ -118,7 +113,7 @@ SD_CONFIG_PATH = CONFIG_DIR / 'sd.yaml' ### ASR ASR_DIR = DATA_DIR / "asr" os.makedirs(ASR_DIR, exist_ok=True) -WHISPER_CPP_DIR = Path(DIR.HOME) / str(os.getenv("WHISPER_CPP_DIR")) +WHISPER_CPP_DIR = Path(Dir.HOME) / str(os.getenv("WHISPER_CPP_DIR")) WHISPER_CPP_MODELS = os.getenv('WHISPER_CPP_MODELS', 'NULL,VOID').split(',') ### TTS @@ -135,6 +130,7 @@ TTS_SEGMENTS_DIR = TTS_DIR / 'segments' os.makedirs(TTS_SEGMENTS_DIR, exist_ok=True) ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY") + ### Calendar & email account MS365_TOGGLE = True if os.getenv("MS365_TOGGLE") == "True" else False ICAL_TOGGLE = True if os.getenv("ICAL_TOGGLE") == "True" else False diff --git a/sijapi/classes.py b/sijapi/classes.py index e2aed2a..481cf05 100644 --- a/sijapi/classes.py +++ b/sijapi/classes.py @@ -22,6 +22,119 @@ from timezonefinder import TimezoneFinder T = TypeVar('T', bound='Configuration') + +import os +from pathlib import Path +from typing import Union, Optional, Any, Dict, List +import yaml +import re +from pydantic import BaseModel, create_model +from dotenv import load_dotenv + +class Configuration(BaseModel): + HOME: Path = Path.home() + _dir_config: Optional['Configuration'] = None + + @classmethod + def load(cls, yaml_path: Union[str, Path], secrets_path: Optional[Union[str, Path]] = None, dir_config: Optional['Configuration'] = None) -> 'Configuration': + yaml_path = cls._resolve_path(yaml_path, 'config') + if secrets_path: + secrets_path = cls._resolve_path(secrets_path, 'config') + + try: + with yaml_path.open('r') as file: + config_data = yaml.safe_load(file) + + print(f"Loaded configuration data from {yaml_path}") + + if secrets_path: + with secrets_path.open('r') as file: + secrets_data = yaml.safe_load(file) + print(f"Loaded secrets data from {secrets_path}") + config_data.update(secrets_data) + + # Ensure HOME is set + if config_data.get('HOME') is None: + config_data['HOME'] = str(Path.home()) + print(f"HOME was None in config, set to default: {config_data['HOME']}") + + load_dotenv() + + instance = cls.create_dynamic_model(**config_data) + instance._dir_config = dir_config or instance + + resolved_data = instance.resolve_placeholders(config_data) + instance = cls.create_dynamic_model(**resolved_data) + instance._dir_config = dir_config or instance + + return instance + except Exception as e: + print(f"Error loading configuration: {str(e)}") + raise + + @classmethod + def _resolve_path(cls, path: Union[str, Path], default_dir: str) -> Path: + base_path = Path(__file__).parent.parent # This will be two levels up from this file + path = Path(path) + if not path.suffix: + path = base_path / 'sijapi' / default_dir / f"{path.name}.yaml" + elif not path.is_absolute(): + path = base_path / path + return path + + def resolve_placeholders(self, data: Any) -> Any: + if isinstance(data, dict): + return {k: self.resolve_placeholders(v) for k, v in data.items()} + elif isinstance(data, list): + return [self.resolve_placeholders(v) for v in data] + elif isinstance(data, str): + return self.resolve_string_placeholders(data) + else: + return data + + def resolve_string_placeholders(self, value: str) -> Any: + pattern = r'\{\{\s*([^}]+)\s*\}\}' + matches = re.findall(pattern, value) + + for match in matches: + parts = match.split('.') + if len(parts) == 1: # Internal reference + replacement = getattr(self._dir_config, parts[0], str(Path.home() / parts[0].lower())) + elif len(parts) == 2 and parts[0] == 'Dir': + replacement = getattr(self._dir_config, parts[1], str(Path.home() / parts[1].lower())) + elif len(parts) == 2 and parts[0] == 'ENV': + replacement = os.getenv(parts[1], '') + else: + replacement = value # Keep original if not recognized + + value = value.replace('{{' + match + '}}', str(replacement)) + + # Convert to Path if it looks like a file path + if isinstance(value, str) and (value.startswith(('/', '~')) or (':' in value and value[1] == ':')): + return Path(value).expanduser() + return value + + @classmethod + def create_dynamic_model(cls, **data): + for key, value in data.items(): + if isinstance(value, dict): + data[key] = cls.create_dynamic_model(**value) + elif isinstance(value, list) and all(isinstance(item, dict) for item in value): + data[key] = [cls.create_dynamic_model(**item) for item in value] + + DynamicModel = create_model( + f'Dynamic{cls.__name__}', + __base__=cls, + **{k: (Any, v) for k, v in data.items()} + ) + return DynamicModel(**data) + + class Config: + extra = "allow" + arbitrary_types_allowed = True + + + class APIConfig(BaseModel): HOST: str PORT: int @@ -34,7 +147,10 @@ class APIConfig(BaseModel): KEYS: List[str] @classmethod - def load(cls, config_path: Path, secrets_path: Path): + def load(cls, config_path: Union[str, Path], secrets_path: Union[str, Path]): + config_path = cls._resolve_path(config_path, 'config') + secrets_path = cls._resolve_path(secrets_path, 'config') + # Load main configuration with open(config_path, 'r') as file: config_data = yaml.safe_load(file) @@ -90,6 +206,16 @@ class APIConfig(BaseModel): return cls(**config_data) + @classmethod + def _resolve_path(cls, path: Union[str, Path], default_dir: str) -> Path: + base_path = Path(__file__).parent.parent # This will be two levels up from this file + path = Path(path) + if not path.suffix: + path = base_path / "sijapi" / default_dir / f"{path.name}.yaml" + elif not path.is_absolute(): + path = base_path / path + return path + @classmethod def resolve_placeholders(cls, config_data: Dict[str, Any]) -> Dict[str, Any]: def resolve_value(value): @@ -127,87 +253,6 @@ class APIConfig(BaseModel): return [module for module, is_active in self.MODULES.__dict__.items() if is_active] -class Configuration(BaseModel): - HOME: Path = Path.home() - _dir_config: Optional['Configuration'] = None - - @classmethod - def load(cls, yaml_path: Union[str, Path], dir_config: Optional['Configuration'] = None) -> 'Configuration': - yaml_path = Path(yaml_path) - try: - with yaml_path.open('r') as file: - config_data = yaml.safe_load(file) - - print(f"Loaded configuration data: {config_data}") - - # Ensure HOME is set - if config_data.get('HOME') is None: - config_data['HOME'] = str(Path.home()) - print(f"HOME was None in config, set to default: {config_data['HOME']}") - - load_dotenv() - - instance = cls.create_dynamic_model(**config_data) - instance._dir_config = dir_config or instance - - resolved_data = instance.resolve_placeholders(config_data) - for key, value in resolved_data.items(): - setattr(instance, key, value) - - return instance - except Exception as e: - print(f"Error loading configuration from {yaml_path}: {str(e)}") - raise - - def resolve_placeholders(self, data: Any) -> Any: - if isinstance(data, dict): - return {k: self.resolve_placeholders(v) for k, v in data.items()} - elif isinstance(data, list): - return [self.resolve_placeholders(v) for v in data] - elif isinstance(data, str): - return self.resolve_string_placeholders(data) - else: - return data - - def resolve_string_placeholders(self, value: str) -> Any: - pattern = r'\{\{\s*([^}]+)\s*\}\}' - matches = re.findall(pattern, value) - - for match in matches: - parts = match.split('.') - if len(parts) == 1: # Internal reference - replacement = getattr(self._dir_config, parts[0], str(Path.home() / parts[0].lower())) - elif len(parts) == 2 and parts[0] == 'DIR': - replacement = getattr(self._dir_config, parts[1], str(Path.home() / parts[1].lower())) - elif len(parts) == 2 and parts[0] == 'ENV': - replacement = os.getenv(parts[1], '') - else: - replacement = value # Keep original if not recognized - - value = value.replace('{{' + match + '}}', str(replacement)) - - # Convert to Path if it looks like a file path - if isinstance(value, str) and (value.startswith(('/', '~')) or (':' in value and value[1] == ':')): - return Path(value).expanduser() - return value - - @classmethod - def create_dynamic_model(cls, **data): - for key, value in data.items(): - if isinstance(value, dict): - data[key] = cls.create_dynamic_model(**value) - - DynamicModel = create_model( - f'Dynamic{cls.__name__}', - __base__=cls, - **{k: (type(v), v) for k, v in data.items()} - ) - return DynamicModel(**data) - - class Config: - extra = "allow" - arbitrary_types_allowed = True - class Location(BaseModel): latitude: float diff --git a/sijapi/config/news.yaml-example b/sijapi/config/news.yaml-example new file mode 100644 index 0000000..34281ff --- /dev/null +++ b/sijapi/config/news.yaml-example @@ -0,0 +1,32 @@ +sites: + - name: The Intercept + url: https://theintercept.com + max_articles: 5 + days_back: 14 + summarize: True + tts: off + tts_voice: Kiel + podcast: True + - name: The New York Times + url: https://www.nytimes.com + max_articles: 10 + days_back: 7 + summarize: True + tts: off + tts_voice: Luna + podcast: True + - name: The Guardian + url: https://theguardian.com + max_articles: 10 + days_back: 7 + summarize: True + tts: off + tts_voice: Attenborough + podcast: True +llm: + model: llama3 +tts: + model: elevenlabs-v2 + voice: Luna + podcast: True + diff --git a/sijapi/routers/news.py b/sijapi/routers/news.py index 096a351..9d7c5d9 100644 --- a/sijapi/routers/news.py +++ b/sijapi/routers/news.py @@ -1,32 +1,214 @@ -from fastapi import APIRouter, BackgroundTasks, File, UploadFile, Form, HTTPException, Response, Query, Path as FastAPIPath -from fastapi.responses import JSONResponse -from zoneinfo import ZoneInfo -from io import BytesIO -from pydantic import BaseModel -from bs4 import BeautifulSoup -import requests -from markdownify import markdownify as md import os -import mimetypes -from datetime import datetime as dt_datetime -import shutil import uuid -import aiohttp -from pathlib import Path +import asyncio +import shutil +import requests +import mimetypes +from io import BytesIO +from bs4 import BeautifulSoup +from zoneinfo import ZoneInfo from urllib.parse import urlparse -from urllib3.util.retry import Retry +from datetime import datetime as dt_datetime, timedelta from typing import Optional + +import aiohttp +import aiofiles import newspaper -from newspaper import Article import trafilatura from readability import Document +from markdownify import markdownify as md from requests.adapters import HTTPAdapter -from sijapi import API, L, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, GEO +from urllib3.util.retry import Retry + +from fastapi import APIRouter, BackgroundTasks, File, UploadFile, Form, HTTPException, Response, Query, Path as FastAPIPath +from fastapi.responses import JSONResponse +from pydantic import BaseModel + +from pathlib import Path + +from sijapi.classes import Configuration +from sijapi import API, L, Dir, News, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, GEO from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path from sijapi.routers import llm, tts, asr, loc +from newspaper import Article + + + news = APIRouter() +async def download_and_save_article(article, site_name, earliest_date, bg_tasks: BackgroundTasks, tts_mode: str = "summary", voice: str = DEFAULT_11L_VOICE): + try: + url = article.url + source = trafilatura.fetch_url(url) + + if source is None: + # Fallback to newspaper3k if trafilatura fails + article.download() + article.parse() + traf = None + else: + traf = trafilatura.extract_metadata(filecontent=source, default_url=url) + article.download() + article.parse() + + # Update article properties, preferring trafilatura data when available + article.title = traf.title if traf and traf.title else article.title or url + article.authors = traf.author if traf and traf.author else article.authors or [] + article.publish_date = traf.date if traf and traf.date else article.publish_date + try: + article.publish_date = await loc.dt(article.publish_date, "UTC") + except: + L.DEBUG(f"Failed to localize {article.publish_date}") + article.publish_date = await loc.dt(dt_datetime.now(), "UTC") + article.meta_description = traf.description if traf and traf.description else article.meta_description + article.text = trafilatura.extract(source, output_format="markdown", include_comments=False) if source else article.text + article.top_image = traf.image if traf and traf.image else article.top_image + article.source_url = traf.sitename if traf and traf.sitename else urlparse(url).netloc.replace('www.', '').title() + article.meta_keywords = traf.categories or traf.tags if traf else article.meta_keywords or [] + article.meta_keywords = article.meta_keywords if isinstance(article.meta_keywords, list) else [article.meta_keywords] + + if not is_article_within_date_range(article, earliest_date): + return False + + + timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M') + readable_title = sanitize_filename(article.title or timestamp) + markdown_filename, relative_path = assemble_journal_path(dt_datetime.now(), subdir="Articles", filename=readable_title, extension=".md") + + summary = await llm.summarize_text(article.text, "Summarize the provided text. Respond with the summary and nothing else. Do not otherwise acknowledge the request. Just provide the requested summary.") + summary = summary.replace('\n', ' ') # Remove line breaks + + if tts_mode == "full" or tts_mode == "content": + tts_text = article.text + elif tts_mode == "summary" or tts_mode == "excerpt": + tts_text = summary + else: + tts_text = None + + banner_markdown = '' + try: + banner_url = article.top_image + if banner_url: + banner_image = download_file(banner_url, Path(OBSIDIAN_VAULT_DIR / OBSIDIAN_RESOURCES_DIR)) + if banner_image: + banner_markdown = f"![[{OBSIDIAN_RESOURCES_DIR}/{banner_image}]]" + except Exception as e: + L.ERR(f"No image found in article") + + + authors = ', '.join(['[[{}]]'.format(author.strip()) for author in article.authors if author.strip()]) + if not authors: + authors = '[[Unknown Author]]' + + frontmatter = f"""--- +title: {readable_title} +authors: {authors} +published: {article.publish_date} +added: {timestamp} +banner: "{banner_markdown}" +tags: +""" + frontmatter += '\n'.join(f" - {tag}" for tag in article.meta_keywords) + frontmatter += '\n---\n' + + body = f"# {readable_title}\n\n" + if tts_text: + audio_filename = f"{article.publish_date.strftime('%Y-%m-%d')} {readable_title}" + try: + audio_path = await tts.generate_speech( + bg_tasks=bg_tasks, + text=tts_text, + voice=voice, + model="eleven_turbo_v2", + podcast=True, + title=audio_filename, + output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR + ) + if isinstance(audio_path, Path): + audio_ext = audio_path.suffix + obsidian_link = f"![[{audio_path.name}]]" + body += f"{obsidian_link}\n\n" + else: + L.WARN(f"Unexpected audio_path type: {type(audio_path)}. Value: {audio_path}") + except Exception as e: + L.ERR(f"Failed to generate TTS for {audio_filename}. Error: {str(e)}") + L.ERR(f"TTS error details - voice: {voice}, model: eleven_turbo_v2, podcast: True") + L.ERR(f"Output directory: {Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR}") + + body += f"by {authors} in {article.source_url}\n\n" + body += f"> [!summary]+\n" + body += f"> {summary}\n\n" + body += article.text + + markdown_content = frontmatter + body + + with open(markdown_filename, 'w') as md_file: + md_file.write(markdown_content) + + L.INFO(f"Successfully saved to {markdown_filename}") + add_to_daily_note(relative_path) + print(f"Saved article: {relative_path}") + return True + + + except Exception as e: + L.ERR(f"Error processing article from {article.url}: {str(e)}") + return False + +# You'll need to update your is_article_within_date_range function: +def is_article_within_date_range(article, earliest_date): + return article.publish_date is not None and article.publish_date.date() >= earliest_date + +async def process_news_site(site, bg_tasks: BackgroundTasks): + print(f"Downloading articles from {site.name}...") + + earliest_date = dt_datetime.now().date() - timedelta(days=site.days_back) + + try: + news_source = newspaper.build(site.url, memoize_articles=False) + + tasks = [] + for article in news_source.articles[:site.max_articles]: + task = asyncio.create_task(download_and_save_article( + article, + site.name, + earliest_date, + bg_tasks, + tts_mode=site.tts if hasattr(site, 'tts') else "off", + voice=site.tts if hasattr(site, 'tts') else DEFAULT_11L_VOICE + )) + tasks.append(task) + + results = await asyncio.gather(*tasks) + articles_downloaded = sum(results) + + print(f"Downloaded {articles_downloaded} articles from {site.name}") + except Exception as e: + print(f"Error processing {site.name}: {str(e)}") + +# Update your news_refresh_endpoint function: +@news.get("/news/refresh") +async def news_refresh_endpoint(bg_tasks: BackgroundTasks): + tasks = [process_news_site(site, bg_tasks) for site in News.sites] + await asyncio.gather(*tasks) + return "OK" + + +async def generate_path(article, site_name): + publish_date = await loc.dt(article.publish_date, 'UTC') if article.publish_date else await loc.dt(dt_datetime.now(), 'UTC') + title_slug = "".join(c if c.isalnum() else "_" for c in article.title) + filename = f"{site_name} - {title_slug[:50]}.md" + absolute_path, relative_path = assemble_journal_path(publish_date, 'Articles', filename, extension='.md', no_timestamp=True) + return absolute_path, relative_path + +async def save_article_to_file(content, output_path): + output_path.parent.mkdir(parents=True, exist_ok=True) + async with aiofiles.open(output_path, 'w', encoding='utf-8') as file: + await file.write(content) + + + ### CLIPPER ### @news.post("/clip") async def clip_post( diff --git a/sijapi/routers/tts.py b/sijapi/routers/tts.py index 92f1912..b5b8dc0 100644 --- a/sijapi/routers/tts.py +++ b/sijapi/routers/tts.py @@ -143,7 +143,7 @@ async def generate_speech( # raise HTTPException(status_code=400, detail="Invalid model specified") if podcast == True: - podcast_path = PODCAST_DIR / audio_file_path.name + podcast_path = Path(PODCAST_DIR) / audio_file_path.name L.DEBUG(f"Podcast path: {podcast_path}") shutil.copy(str(audio_file_path), str(podcast_path)) bg_tasks.add_task(os.remove, str(audio_file_path)) @@ -152,7 +152,7 @@ async def generate_speech( return str(audio_file_path) except Exception as e: - L.ERROR(f"Failed to generate speech: {str(e)}") + L.ERR(f"Failed to generate speech: {str(e)}") raise HTTPException(status_code=500, detail=f"Failed to generate speech: {str(e)}") @@ -331,7 +331,7 @@ async def local_tts( # Export the combined audio in a separate thread if podcast: - podcast_file_path = PODCAST_DIR / file_path.name + podcast_file_path = Path(PODCAST_DIR) / file_path.name await asyncio.to_thread(combined_audio.export, podcast_file_path, format="wav") await asyncio.to_thread(combined_audio.export, file_path, format="wav") @@ -425,7 +425,7 @@ def copy_to_podcast_dir(file_path): file_name = Path(file_path).name # Construct the destination path in the PODCAST_DIR - destination_path = PODCAST_DIR / file_name + destination_path = Path(PODCAST_DIR) / file_name # Copy the file to the PODCAST_DIR shutil.copy(file_path, destination_path)