Auto-update: Sun Jun 30 11:08:14 PDT 2024

This commit is contained in:
sanj 2024-06-30 11:08:14 -07:00
parent c9dc619a5a
commit c742336b62
5 changed files with 369 additions and 114 deletions

View file

@ -23,21 +23,16 @@ os.makedirs(LOGS_DIR, exist_ok=True)
load_dotenv(ENV_PATH)
### API essentials
API_CONFIG_PATH = CONFIG_DIR / "api.yaml"
SECRETS_PATH = CONFIG_DIR / "secrets.yaml"
API = APIConfig.load(API_CONFIG_PATH, SECRETS_PATH)
DIR_CONFIG_PATH = CONFIG_DIR / "dirs.yaml"
L.DEBUG(f"Loading DIR configuration from: {DIR_CONFIG_PATH}")
DIR = Configuration.load(DIR_CONFIG_PATH)
L.DEBUG(f"Loaded DIR configuration: {DIR.__dict__}")
DB = Database.from_env()
API = APIConfig.load('api', 'secrets')
Dir = Configuration.load('dirs')
HOST = f"{API.BIND}:{API.PORT}"
LOCAL_HOSTS = [ipaddress.ip_address(localhost.strip()) for localhost in os.getenv('LOCAL_HOSTS', '127.0.0.1').split(',')] + ['localhost']
SUBNET_BROADCAST = os.getenv("SUBNET_BROADCAST", '10.255.255.255')
MAX_CPU_CORES = min(int(os.getenv("MAX_CPU_CORES", int(multiprocessing.cpu_count()/2))), multiprocessing.cpu_count())
DB = Database.from_env()
News = Configuration.load('news', 'secrets')
SD = Configuration.load('sd', 'secrets')
### Directories & general paths
ROUTER_DIR = BASE_DIR / "routers"
@ -66,7 +61,7 @@ GEO = Geocoder(NAMED_LOCATIONS, TZ_CACHE)
### Obsidian & notes
ALLOWED_FILENAME_CHARS = r'[^\w \.-]'
MAX_PATH_LENGTH = 254
OBSIDIAN_VAULT_DIR = Path(os.getenv("OBSIDIAN_BASE_DIR") or Path(DIR.HOME) / "Nextcloud" / "notes")
OBSIDIAN_VAULT_DIR = Path(os.getenv("OBSIDIAN_BASE_DIR") or Path(Dir.HOME) / "Nextcloud" / "notes")
OBSIDIAN_JOURNAL_DIR = OBSIDIAN_VAULT_DIR / "journal"
OBSIDIAN_RESOURCES_DIR = "obsidian/resources"
OBSIDIAN_BANNER_DIR = f"{OBSIDIAN_RESOURCES_DIR}/banners"
@ -118,7 +113,7 @@ SD_CONFIG_PATH = CONFIG_DIR / 'sd.yaml'
### ASR
ASR_DIR = DATA_DIR / "asr"
os.makedirs(ASR_DIR, exist_ok=True)
WHISPER_CPP_DIR = Path(DIR.HOME) / str(os.getenv("WHISPER_CPP_DIR"))
WHISPER_CPP_DIR = Path(Dir.HOME) / str(os.getenv("WHISPER_CPP_DIR"))
WHISPER_CPP_MODELS = os.getenv('WHISPER_CPP_MODELS', 'NULL,VOID').split(',')
### TTS
@ -135,6 +130,7 @@ TTS_SEGMENTS_DIR = TTS_DIR / 'segments'
os.makedirs(TTS_SEGMENTS_DIR, exist_ok=True)
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
### Calendar & email account
MS365_TOGGLE = True if os.getenv("MS365_TOGGLE") == "True" else False
ICAL_TOGGLE = True if os.getenv("ICAL_TOGGLE") == "True" else False

View file

@ -22,6 +22,119 @@ from timezonefinder import TimezoneFinder
T = TypeVar('T', bound='Configuration')
import os
from pathlib import Path
from typing import Union, Optional, Any, Dict, List
import yaml
import re
from pydantic import BaseModel, create_model
from dotenv import load_dotenv
class Configuration(BaseModel):
    """Dynamic configuration loaded from YAML with ``{{ placeholder }}`` resolution.

    Placeholders supported inside string values:
      - ``{{ NAME }}``       -> attribute lookup on the directory config
      - ``{{ Dir.NAME }}``   -> attribute lookup on the directory config
      - ``{{ ENV.NAME }}``   -> environment variable (empty string default)
    """
    # Base directory fallback; individual YAML files may override HOME.
    HOME: Path = Path.home()
    # Configuration consulted for placeholder lookups; defaults to the
    # instance itself when none is supplied to load().
    _dir_config: Optional['Configuration'] = None

    @classmethod
    def load(cls, yaml_path: Union[str, Path], secrets_path: Optional[Union[str, Path]] = None, dir_config: Optional['Configuration'] = None) -> 'Configuration':
        """Load a YAML config (optionally merged with a secrets YAML) into a
        dynamically-typed Configuration with placeholders resolved.

        Re-raises any file/YAML error after logging it.
        """
        yaml_path = cls._resolve_path(yaml_path, 'config')
        if secrets_path:
            secrets_path = cls._resolve_path(secrets_path, 'config')
        try:
            with yaml_path.open('r') as file:
                # An empty YAML file makes safe_load return None; normalize to {}
                # so .update()/.get() below cannot crash.
                config_data = yaml.safe_load(file) or {}
            print(f"Loaded configuration data from {yaml_path}")
            if secrets_path:
                with secrets_path.open('r') as file:
                    secrets_data = yaml.safe_load(file) or {}
                print(f"Loaded secrets data from {secrets_path}")
                config_data.update(secrets_data)
            # Ensure HOME is set
            if config_data.get('HOME') is None:
                config_data['HOME'] = str(Path.home())
                print(f"HOME was None in config, set to default: {config_data['HOME']}")
            load_dotenv()
            # First pass builds a model so placeholder resolution can use
            # attribute lookups; second pass rebuilds from the resolved data.
            instance = cls.create_dynamic_model(**config_data)
            instance._dir_config = dir_config or instance
            resolved_data = instance.resolve_placeholders(config_data)
            instance = cls.create_dynamic_model(**resolved_data)
            instance._dir_config = dir_config or instance
            return instance
        except Exception as e:
            print(f"Error loading configuration: {str(e)}")
            raise

    @classmethod
    def _resolve_path(cls, path: Union[str, Path], default_dir: str) -> Path:
        """Map a bare name to <repo>/sijapi/<default_dir>/<name>.yaml and anchor
        relative paths at the repository root; absolute paths pass through."""
        base_path = Path(__file__).parent.parent  # This will be two levels up from this file
        path = Path(path)
        if not path.suffix:
            path = base_path / 'sijapi' / default_dir / f"{path.name}.yaml"
        elif not path.is_absolute():
            path = base_path / path
        return path

    def resolve_placeholders(self, data: Any) -> Any:
        """Recursively resolve placeholders inside dicts, lists, and strings."""
        if isinstance(data, dict):
            return {k: self.resolve_placeholders(v) for k, v in data.items()}
        elif isinstance(data, list):
            return [self.resolve_placeholders(v) for v in data]
        elif isinstance(data, str):
            return self.resolve_string_placeholders(data)
        else:
            return data

    def resolve_string_placeholders(self, value: str) -> Any:
        """Substitute every ``{{ ... }}`` token in *value*.

        Uses re.sub with a replacement callback so placeholders written with
        surrounding whitespace ("{{ HOME }}") are substituted correctly; the
        previous str.replace reconstruction dropped the whitespace and never
        matched such tokens.
        """
        pattern = r'\{\{\s*([^}]+?)\s*\}\}'

        def _substitute(m: 're.Match') -> str:
            parts = m.group(1).split('.')
            if len(parts) == 1:  # Internal reference
                return str(getattr(self._dir_config, parts[0], str(Path.home() / parts[0].lower())))
            if len(parts) == 2 and parts[0] == 'Dir':
                return str(getattr(self._dir_config, parts[1], str(Path.home() / parts[1].lower())))
            if len(parts) == 2 and parts[0] == 'ENV':
                return os.getenv(parts[1], '')
            return m.group(0)  # Keep original if not recognized

        value = re.sub(pattern, _substitute, value)
        # Convert to Path if it looks like a file path: POSIX absolute/home,
        # or a Windows drive spec such as "C:\...". The len() guard prevents
        # an IndexError on one-character strings containing ':'.
        if isinstance(value, str) and (value.startswith(('/', '~')) or (len(value) > 1 and value[1] == ':')):
            return Path(value).expanduser()
        return value

    @classmethod
    def create_dynamic_model(cls, **data):
        """Build a pydantic model subclass whose fields mirror *data*; nested
        dicts (and lists of dicts) become nested dynamic models."""
        for key, value in data.items():
            if isinstance(value, dict):
                data[key] = cls.create_dynamic_model(**value)
            elif isinstance(value, list) and all(isinstance(item, dict) for item in value):
                data[key] = [cls.create_dynamic_model(**item) for item in value]
        DynamicModel = create_model(
            f'Dynamic{cls.__name__}',
            __base__=cls,
            **{k: (Any, v) for k, v in data.items()}
        )
        return DynamicModel(**data)

    class Config:
        extra = "allow"
        arbitrary_types_allowed = True
class APIConfig(BaseModel):
HOST: str
PORT: int
@ -34,7 +147,10 @@ class APIConfig(BaseModel):
KEYS: List[str]
@classmethod
def load(cls, config_path: Path, secrets_path: Path):
def load(cls, config_path: Union[str, Path], secrets_path: Union[str, Path]):
config_path = cls._resolve_path(config_path, 'config')
secrets_path = cls._resolve_path(secrets_path, 'config')
# Load main configuration
with open(config_path, 'r') as file:
config_data = yaml.safe_load(file)
@ -90,6 +206,16 @@ class APIConfig(BaseModel):
return cls(**config_data)
@classmethod
def _resolve_path(cls, path: Union[str, Path], default_dir: str) -> Path:
    """Normalize *path* to an absolute Path.

    A bare name (no suffix) maps to <repo>/sijapi/<default_dir>/<name>.yaml;
    a relative path is anchored at the repository root; an absolute path
    with a suffix is returned unchanged.
    """
    repo_root = Path(__file__).parent.parent  # two levels up from this file
    candidate = Path(path)
    if not candidate.suffix:
        return repo_root / "sijapi" / default_dir / f"{candidate.name}.yaml"
    if not candidate.is_absolute():
        return repo_root / candidate
    return candidate
@classmethod
def resolve_placeholders(cls, config_data: Dict[str, Any]) -> Dict[str, Any]:
def resolve_value(value):
@ -127,87 +253,6 @@ class APIConfig(BaseModel):
return [module for module, is_active in self.MODULES.__dict__.items() if is_active]
class Configuration(BaseModel):
    # NOTE(review): this is the pre-refactor Configuration shown as removed by
    # the diff; the newer version earlier in the module adds secrets support
    # and path resolution. Kept byte-comparable for reference.
    HOME: Path = Path.home()
    # Configuration consulted for {{ DIR.* }} placeholder lookups; defaults to self.
    _dir_config: Optional['Configuration'] = None

    @classmethod
    def load(cls, yaml_path: Union[str, Path], dir_config: Optional['Configuration'] = None) -> 'Configuration':
        """Load a YAML file into a dynamically-typed Configuration and resolve
        its {{ ... }} placeholders onto the instance. Re-raises load errors."""
        yaml_path = Path(yaml_path)
        try:
            with yaml_path.open('r') as file:
                config_data = yaml.safe_load(file)
            print(f"Loaded configuration data: {config_data}")
            # Ensure HOME is set
            if config_data.get('HOME') is None:
                config_data['HOME'] = str(Path.home())
                print(f"HOME was None in config, set to default: {config_data['HOME']}")
            load_dotenv()
            instance = cls.create_dynamic_model(**config_data)
            instance._dir_config = dir_config or instance
            resolved_data = instance.resolve_placeholders(config_data)
            # Write resolved values back onto the same instance rather than
            # rebuilding the dynamic model from scratch.
            for key, value in resolved_data.items():
                setattr(instance, key, value)
            return instance
        except Exception as e:
            print(f"Error loading configuration from {yaml_path}: {str(e)}")
            raise

    def resolve_placeholders(self, data: Any) -> Any:
        """Recursively resolve placeholders inside dicts, lists, and strings."""
        if isinstance(data, dict):
            return {k: self.resolve_placeholders(v) for k, v in data.items()}
        elif isinstance(data, list):
            return [self.resolve_placeholders(v) for v in data]
        elif isinstance(data, str):
            return self.resolve_string_placeholders(data)
        else:
            return data

    def resolve_string_placeholders(self, value: str) -> Any:
        """Substitute {{ NAME }}, {{ DIR.NAME }}, and {{ ENV.NAME }} tokens."""
        pattern = r'\{\{\s*([^}]+)\s*\}\}'
        matches = re.findall(pattern, value)
        for match in matches:
            parts = match.split('.')
            if len(parts) == 1:  # Internal reference
                replacement = getattr(self._dir_config, parts[0], str(Path.home() / parts[0].lower()))
            elif len(parts) == 2 and parts[0] == 'DIR':
                replacement = getattr(self._dir_config, parts[1], str(Path.home() / parts[1].lower()))
            elif len(parts) == 2 and parts[0] == 'ENV':
                replacement = os.getenv(parts[1], '')
            else:
                replacement = value  # Keep original if not recognized
            # NOTE(review): the greedy capture keeps inner whitespace, so
            # "{{ HOME }}" yields match == "HOME " and the reconstructed
            # "{{HOME }}" never matches the original text — placeholders
            # written with spaces are silently left unresolved.
            value = value.replace('{{' + match + '}}', str(replacement))
        # Convert to Path if it looks like a file path
        # NOTE(review): value[1] raises IndexError for a one-character string
        # containing ':' — confirm inputs, or guard with len(value) > 1.
        if isinstance(value, str) and (value.startswith(('/', '~')) or (':' in value and value[1] == ':')):
            return Path(value).expanduser()
        return value

    @classmethod
    def create_dynamic_model(cls, **data):
        """Build a pydantic model subclass whose fields mirror *data*; nested
        dicts become nested dynamic models."""
        for key, value in data.items():
            if isinstance(value, dict):
                data[key] = cls.create_dynamic_model(**value)
        # NOTE(review): each field is typed as type(v), so a None value types
        # the field as NoneType — confirm upstream data never carries None
        # (the HOME guard in load() only covers that one key).
        DynamicModel = create_model(
            f'Dynamic{cls.__name__}',
            __base__=cls,
            **{k: (type(v), v) for k, v in data.items()}
        )
        return DynamicModel(**data)

    class Config:
        # Allow extra keys from YAML and non-pydantic types (e.g. Path).
        extra = "allow"
        arbitrary_types_allowed = True
class Location(BaseModel):
latitude: float

View file

@ -0,0 +1,32 @@
# News-scraper configuration: per-site scrape settings plus shared LLM/TTS
# defaults consumed by the news router (News = Configuration.load('news', ...)).
sites:
  - name: The Intercept
    url: https://theintercept.com
    max_articles: 5       # cap on articles fetched per refresh
    days_back: 14         # skip articles older than this many days
    summarize: True
    # NOTE(review): bare "off" parses as YAML boolean false, not the string
    # "off" the router compares against — confirm intent; quote as "off" if a
    # string is expected.
    tts: off
    tts_voice: Kiel
    podcast: True
  - name: The New York Times
    url: https://www.nytimes.com
    max_articles: 10
    days_back: 7
    summarize: True
    tts: off
    tts_voice: Luna
    podcast: True
  - name: The Guardian
    url: https://theguardian.com
    max_articles: 10
    days_back: 7
    summarize: True
    tts: off
    tts_voice: Attenborough
    podcast: True
# Shared defaults for summarization and speech synthesis.
llm:
  model: llama3
tts:
  model: elevenlabs-v2
  voice: Luna
  podcast: True

View file

@ -1,32 +1,214 @@
from fastapi import APIRouter, BackgroundTasks, File, UploadFile, Form, HTTPException, Response, Query, Path as FastAPIPath
from fastapi.responses import JSONResponse
from zoneinfo import ZoneInfo
from io import BytesIO
from pydantic import BaseModel
from bs4 import BeautifulSoup
import requests
from markdownify import markdownify as md
import os
import mimetypes
from datetime import datetime as dt_datetime
import shutil
import uuid
import aiohttp
from pathlib import Path
import asyncio
import shutil
import requests
import mimetypes
from io import BytesIO
from bs4 import BeautifulSoup
from zoneinfo import ZoneInfo
from urllib.parse import urlparse
from urllib3.util.retry import Retry
from datetime import datetime as dt_datetime, timedelta
from typing import Optional
import aiohttp
import aiofiles
import newspaper
from newspaper import Article
import trafilatura
from readability import Document
from markdownify import markdownify as md
from requests.adapters import HTTPAdapter
from sijapi import API, L, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, GEO
from urllib3.util.retry import Retry
from fastapi import APIRouter, BackgroundTasks, File, UploadFile, Form, HTTPException, Response, Query, Path as FastAPIPath
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from pathlib import Path
from sijapi.classes import Configuration
from sijapi import API, L, Dir, News, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, OBSIDIAN_BANNER_SCENE, DEFAULT_11L_VOICE, DEFAULT_VOICE, GEO
from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path
from sijapi.routers import llm, tts, asr, loc
from newspaper import Article
news = APIRouter()
async def download_and_save_article(article, site_name, earliest_date, bg_tasks: BackgroundTasks, tts_mode: str = "summary", voice: str = DEFAULT_11L_VOICE):
    """Fetch one article, enrich it with trafilatura metadata, and save it as an
    Obsidian markdown note (optionally with a TTS audio embed).

    Returns True on success, False when the article is out of date range or any
    step fails (all exceptions are caught and logged).
    """
    try:
        url = article.url
        source = trafilatura.fetch_url(url)
        if source is None:
            # Fallback to newspaper3k if trafilatura fails
            article.download()
            article.parse()
            traf = None
        else:
            traf = trafilatura.extract_metadata(filecontent=source, default_url=url)
            article.download()
            article.parse()
        # Update article properties, preferring trafilatura data when available
        article.title = traf.title if traf and traf.title else article.title or url
        article.authors = traf.author if traf and traf.author else article.authors or []
        article.publish_date = traf.date if traf and traf.date else article.publish_date
        try:
            article.publish_date = await loc.dt(article.publish_date, "UTC")
        # NOTE(review): bare except also swallows CancelledError/KeyboardInterrupt;
        # consider narrowing to Exception.
        except:
            L.DEBUG(f"Failed to localize {article.publish_date}")
            article.publish_date = await loc.dt(dt_datetime.now(), "UTC")
        article.meta_description = traf.description if traf and traf.description else article.meta_description
        article.text = trafilatura.extract(source, output_format="markdown", include_comments=False) if source else article.text
        article.top_image = traf.image if traf and traf.image else article.top_image
        article.source_url = traf.sitename if traf and traf.sitename else urlparse(url).netloc.replace('www.', '').title()
        article.meta_keywords = traf.categories or traf.tags if traf else article.meta_keywords or []
        # Normalize keywords to a list so the frontmatter join below is safe.
        article.meta_keywords = article.meta_keywords if isinstance(article.meta_keywords, list) else [article.meta_keywords]
        if not is_article_within_date_range(article, earliest_date):
            return False
        timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
        readable_title = sanitize_filename(article.title or timestamp)
        markdown_filename, relative_path = assemble_journal_path(dt_datetime.now(), subdir="Articles", filename=readable_title, extension=".md")
        summary = await llm.summarize_text(article.text, "Summarize the provided text. Respond with the summary and nothing else. Do not otherwise acknowledge the request. Just provide the requested summary.")
        summary = summary.replace('\n', ' ')  # Remove line breaks
        # Choose what (if anything) gets spoken: full text, the summary, or nothing.
        if tts_mode == "full" or tts_mode == "content":
            tts_text = article.text
        elif tts_mode == "summary" or tts_mode == "excerpt":
            tts_text = summary
        else:
            tts_text = None
        banner_markdown = ''
        try:
            banner_url = article.top_image
            if banner_url:
                # NOTE(review): download_file is not visible in this diff —
                # presumably defined elsewhere in this module; verify.
                banner_image = download_file(banner_url, Path(OBSIDIAN_VAULT_DIR / OBSIDIAN_RESOURCES_DIR))
                if banner_image:
                    banner_markdown = f"![[{OBSIDIAN_RESOURCES_DIR}/{banner_image}]]"
        except Exception as e:
            L.ERR(f"No image found in article")
        # Render authors as Obsidian wiki-links, with a placeholder when empty.
        authors = ', '.join(['[[{}]]'.format(author.strip()) for author in article.authors if author.strip()])
        if not authors:
            authors = '[[Unknown Author]]'
        frontmatter = f"""---
title: {readable_title}
authors: {authors}
published: {article.publish_date}
added: {timestamp}
banner: "{banner_markdown}"
tags:
"""
        frontmatter += '\n'.join(f" - {tag}" for tag in article.meta_keywords)
        frontmatter += '\n---\n'
        body = f"# {readable_title}\n\n"
        if tts_text:
            audio_filename = f"{article.publish_date.strftime('%Y-%m-%d')} {readable_title}"
            try:
                audio_path = await tts.generate_speech(
                    bg_tasks=bg_tasks,
                    text=tts_text,
                    voice=voice,
                    model="eleven_turbo_v2",
                    podcast=True,
                    title=audio_filename,
                    output_dir=Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR
                )
                if isinstance(audio_path, Path):
                    audio_ext = audio_path.suffix
                    obsidian_link = f"![[{audio_path.name}]]"
                    body += f"{obsidian_link}\n\n"
                else:
                    L.WARN(f"Unexpected audio_path type: {type(audio_path)}. Value: {audio_path}")
            except Exception as e:
                # TTS failure is non-fatal: the note is still written without audio.
                L.ERR(f"Failed to generate TTS for {audio_filename}. Error: {str(e)}")
                L.ERR(f"TTS error details - voice: {voice}, model: eleven_turbo_v2, podcast: True")
                L.ERR(f"Output directory: {Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_RESOURCES_DIR}")
        body += f"by {authors} in {article.source_url}\n\n"
        body += f"> [!summary]+\n"
        body += f"> {summary}\n\n"
        body += article.text
        markdown_content = frontmatter + body
        with open(markdown_filename, 'w') as md_file:
            md_file.write(markdown_content)
        L.INFO(f"Successfully saved to {markdown_filename}")
        # NOTE(review): add_to_daily_note is not visible in this diff —
        # presumably defined elsewhere in this module; verify.
        add_to_daily_note(relative_path)
        print(f"Saved article: {relative_path}")
        return True
    except Exception as e:
        L.ERR(f"Error processing article from {article.url}: {str(e)}")
        return False
def is_article_within_date_range(article, earliest_date):
    """Return True when the article has a publish date on or after *earliest_date*."""
    published = article.publish_date
    if published is None:
        return False
    return published.date() >= earliest_date
async def process_news_site(site, bg_tasks: BackgroundTasks):
    """Download and save up to site.max_articles recent articles from *site*.

    Articles older than site.days_back days are filtered out per-article.
    Any exception is logged and swallowed so one failing site does not abort
    the whole refresh run.
    """
    print(f"Downloading articles from {site.name}...")
    earliest_date = dt_datetime.now().date() - timedelta(days=site.days_back)
    try:
        news_source = newspaper.build(site.url, memoize_articles=False)
        tasks = []
        for article in news_source.articles[:site.max_articles]:
            task = asyncio.create_task(download_and_save_article(
                article,
                site.name,
                earliest_date,
                bg_tasks,
                tts_mode=getattr(site, 'tts', "off"),
                # BUG FIX: the voice comes from site.tts_voice (see news.yaml);
                # previously site.tts — the TTS *mode*, e.g. "off" — was passed
                # as the voice argument.
                voice=getattr(site, 'tts_voice', DEFAULT_11L_VOICE)
            ))
            tasks.append(task)
        # download_and_save_article returns True/False, so summing counts successes.
        results = await asyncio.gather(*tasks)
        articles_downloaded = sum(results)
        print(f"Downloaded {articles_downloaded} articles from {site.name}")
    except Exception as e:
        print(f"Error processing {site.name}: {str(e)}")
# Refresh endpoint: fans out one task per configured site and waits for all.
@news.get("/news/refresh")
async def news_refresh_endpoint(bg_tasks: BackgroundTasks):
    """Trigger a concurrent scrape of every site listed in News.sites."""
    tasks = [process_news_site(site, bg_tasks) for site in News.sites]
    await asyncio.gather(*tasks)
    return "OK"
async def generate_path(article, site_name):
    """Return (absolute_path, relative_path) for an article's markdown note
    under the Articles journal subdirectory, dated by its publish date."""
    raw_date = article.publish_date if article.publish_date else dt_datetime.now()
    publish_date = await loc.dt(raw_date, 'UTC')
    # Replace every non-alphanumeric character with '_' and cap the slug length.
    title_slug = "".join(ch if ch.isalnum() else "_" for ch in article.title)
    filename = f"{site_name} - {title_slug[:50]}.md"
    return assemble_journal_path(publish_date, 'Articles', filename, extension='.md', no_timestamp=True)
async def save_article_to_file(content, output_path):
    """Asynchronously write *content* (UTF-8 text) to *output_path*, creating
    any missing parent directories first."""
    output_path.parent.mkdir(parents=True, exist_ok=True)
    async with aiofiles.open(output_path, 'w', encoding='utf-8') as file:
        await file.write(content)
### CLIPPER ###
@news.post("/clip")
async def clip_post(

View file

@ -143,7 +143,7 @@ async def generate_speech(
# raise HTTPException(status_code=400, detail="Invalid model specified")
if podcast == True:
podcast_path = PODCAST_DIR / audio_file_path.name
podcast_path = Path(PODCAST_DIR) / audio_file_path.name
L.DEBUG(f"Podcast path: {podcast_path}")
shutil.copy(str(audio_file_path), str(podcast_path))
bg_tasks.add_task(os.remove, str(audio_file_path))
@ -152,7 +152,7 @@ async def generate_speech(
return str(audio_file_path)
except Exception as e:
L.ERROR(f"Failed to generate speech: {str(e)}")
L.ERR(f"Failed to generate speech: {str(e)}")
raise HTTPException(status_code=500, detail=f"Failed to generate speech: {str(e)}")
@ -331,7 +331,7 @@ async def local_tts(
# Export the combined audio in a separate thread
if podcast:
podcast_file_path = PODCAST_DIR / file_path.name
podcast_file_path = Path(PODCAST_DIR) / file_path.name
await asyncio.to_thread(combined_audio.export, podcast_file_path, format="wav")
await asyncio.to_thread(combined_audio.export, file_path, format="wav")
@ -425,7 +425,7 @@ def copy_to_podcast_dir(file_path):
file_name = Path(file_path).name
# Construct the destination path in the PODCAST_DIR
destination_path = PODCAST_DIR / file_name
destination_path = Path(PODCAST_DIR) / file_name
# Copy the file to the PODCAST_DIR
shutil.copy(file_path, destination_path)