Auto-update: Wed Jun 26 10:02:40 PDT 2024

2024-06-26 10:02:40 -07:00 · 2024-06-26 10:02:40 -07:00 · 04cd4c5bf2
commit 04cd4c5bf2
parent 45059a9555
5 changed files with 246 additions and 167 deletions
--- a/sijapi/init.py
+++ b/sijapi/init.py
@ -161,7 +161,8 @@ ICS_PATH = DATA_DIR / 'calendar.ics' # deprecated now, but maybe revive?
 ICALENDARS = os.getenv('ICALENDARS', 'NULL,VOID').split(',')
 EMAIL_CONFIG = CONFIG_DIR / "email.yaml"
-AUTORESPOND = True
+EMAIL_LOGS = LOGS_DIR / "email"
 os.makedirs(EMAIL_LOGS, exist_ok = True)
 ### Courtlistener & other webhooks
 COURTLISTENER_DOCKETS_DIR = DATA_DIR / "courtlistener" / "dockets"
--- a/sijapi/helpers/log_prior_emails.py
+++ b/sijapi/helpers/log_prior_emails.py
@ -0,0 +1,64 @@
 import asyncio
 from pathlib import Path
 from sijapi import DEBUG, INFO, ERR
 from sijapi import EMAIL_CONFIG, EMAIL_LOGS
 from sijapi.classes import EmailAccount
 from sijapi.routers import email
 async def initialize_log_files():
    summarized_log = EMAIL_LOGS / "summarized.txt"
    autoresponded_log = EMAIL_LOGS / "autoresponded.txt"
    diagnostic_log = EMAIL_LOGS / "diagnostic.txt"
    for log_file in [summarized_log, autoresponded_log, diagnostic_log]:
        log_file.parent.mkdir(parents=True, exist_ok=True)
        log_file.write_text("")
    DEBUG(f"Log files initialized: {summarized_log}, {autoresponded_log}, {diagnostic_log}")
    return summarized_log, autoresponded_log, diagnostic_log
 async def process_all_emails(account: EmailAccount, summarized_log: Path, autoresponded_log: Path, diagnostic_log: Path):
    try:
        with email.get_imap_connection(account) as inbox:
            DEBUG(f"Connected to {account.name}, processing all emails...")
            all_messages = inbox.messages()
            unread_messages = set(uid for uid, _ in inbox.messages(unread=True))
            processed_count = 0
            for identifier, message in all_messages:
                # Log diagnostic information
                with open(diagnostic_log, 'a') as f:
                    f.write(f"Account: {account.name}, Raw Identifier: {identifier}, Type: {type(identifier)}\n")
                # Attempt to get a string representation of the identifier
                if isinstance(identifier, bytes):
                    id_str = identifier.decode()
                elif isinstance(identifier, (int, str)):
                    id_str = str(identifier)
                else:
                    id_str = repr(identifier)
                if identifier not in unread_messages:
                    processed_count += 1
                    for log_file in [summarized_log, autoresponded_log]:
                        with open(log_file, 'a') as f:
                            f.write(f"{id_str}\n")
            INFO(f"Processed {processed_count} non-unread emails for account {account.name}")
    except Exception as e:
        ERR(f"An error occurred while processing emails for account {account.name}: {e}")
 async def main():
    email_accounts = email.load_email_accounts(EMAIL_CONFIG)
    summarized_log, autoresponded_log, diagnostic_log = await initialize_log_files()
    DEBUG(f"Processing {len(email_accounts)} email accounts")
    tasks = [process_all_emails(account, summarized_log, autoresponded_log, diagnostic_log) for account in email_accounts]
    await asyncio.gather(*tasks)
    # Final verification
    with open(summarized_log, 'r') as f:
        final_count = len(f.readlines())
    INFO(f"Final non-unread email count: {final_count}")
 if __name__ == "__main__":
    asyncio.run(main())
--- a/sijapi/logs.py
+++ b/sijapi/logs.py
@ -1,65 +1,36 @@
 import os
 import sys
-import logging
+from loguru import logger
 from logging.handlers import RotatingFileHandler
 from colorama import Fore, Back, Style, init as colorama_init
 import traceback
 # Force colorama to initialize for the current platform
 colorama_init(autoreset=True, strip=False, convert=True)
 class ColorFormatter(logging.Formatter):
    """Custom formatter to add colors to log levels."""
    COLOR_MAP = {
        logging.DEBUG: Fore.CYAN,
        logging.INFO: Fore.GREEN,
        logging.WARNING: Fore.YELLOW,
        logging.ERROR: Fore.RED,
        logging.CRITICAL: Fore.MAGENTA + Back.WHITE,
    }
    def format(self, record):
        log_message = super().format(record)
        color = self.COLOR_MAP.get(record.levelno, '')
        return f"{color}{log_message}{Style.RESET_ALL}"
 class Logger:
    def __init__(self, name, logs_dir):
        self.logs_dir = logs_dir
-        self.logger = logging.getLogger(name)
+        self.name = name
-        self.logger.setLevel(logging.DEBUG)
+        self.logger = logger.bind(name=name)
    def setup_from_args(self, args):
        if not os.path.exists(self.logs_dir):
            os.makedirs(self.logs_dir)
        # Remove default logger
        logger.remove()
        # File handler
        handler_path = os.path.join(self.logs_dir, 'app.log')
-        file_handler = RotatingFileHandler(handler_path, maxBytes=2000000, backupCount=10)
+        logger.add(handler_path, rotation="2 MB", compression="zip", level="DEBUG", format="{time:YYYY-MM-DD HH:mm:ss} - {name} - {level} - {message}")
        file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        file_handler.setLevel(logging.DEBUG)
        # Console handler
-        console_handler = logging.StreamHandler(sys.stdout)  # Explicitly use sys.stdout
+        log_format = "<cyan>{time:YYYY-MM-DD HH:mm:ss}</cyan> - <cyan>{name}</cyan> - <level>{level: <8}</level> - <level>{message}</level>"
-        console_formatter = ColorFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        console_level = "DEBUG" if args.debug else "INFO"
-        console_handler.setFormatter(console_formatter)
+        logger.add(sys.stdout, format=log_format, level=console_level, colorize=True)
        # Set console handler level based on args
        if args.debug:
            console_handler.setLevel(logging.DEBUG)
        else:
            console_handler.setLevel(logging.INFO)
        # Add handlers to logger
        self.logger.addHandler(file_handler)
        self.logger.addHandler(console_handler)
        # Test color output
        self.logger.debug("Debug message (should be Cyan)")
        self.logger.info("Info message (should be Green)")
        self.logger.warning("Warning message (should be Yellow)")
        self.logger.error("Error message (should be Red)")
-        self.logger.critical("Critical message (should be Magenta on White)")
+        self.logger.critical("Critical message (should be Magenta)")
    def get_logger(self):
        return self.logger
@ -71,9 +42,9 @@ if __name__ == "__main__":
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args()
-    logger = Logger("test", "logs")
+    logger_instance = Logger("test", "logs")
-    logger.setup_from_args(args)
+    logger_instance.setup_from_args(args)
-    test_logger = logger.get_logger()
+    test_logger = logger_instance.get_logger()
    print("FORCE_COLOR:", os.environ.get('FORCE_COLOR'))
    print("NO_COLOR:", os.environ.get('NO_COLOR'))
--- a/sijapi/routers/asr.py
+++ b/sijapi/routers/asr.py
@ -1,67 +1,50 @@
-'''
+import os
-Automatic Speech Recognition module relying on the `whisper_cpp` implementation of OpenAI's Whisper model.
+import sys
-Depends on:
+import uuid
-  LOGGER, ASR_DIR, WHISPER_CPP_MODELS, GARBAGE_COLLECTION_INTERVAL, GARBAGE_TTL, WHISPER_CPP_DIR
+import json
-Notes: 
+import asyncio
-  Performs exceptionally well on Apple Silicon. Other devices will benefit from future updates to optionally use `faster_whisper`, `insanely_faster_whisper`, and/or `whisper_jax`.
+import tempfile
-'''
+import subprocess
-
+from urllib.parse import unquote
-from fastapi import APIRouter, HTTPException, Form, UploadFile, File
+from fastapi import APIRouter, HTTPException, Form, UploadFile, File, BackgroundTasks
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel, Field
 from typing import Optional
 import tempfile
 from fastapi.responses import JSONResponse, FileResponse
 from pydantic import BaseModel, HttpUrl
 from whisperplus.pipelines import mlx_whisper
 from youtube_dl import YoutubeDL
 from urllib.parse import unquote
 import subprocess
 import os
 import uuid
 from threading import Thread
 import multiprocessing
 import asyncio
 import subprocess
 import tempfile
 from sijapi import DEBUG, INFO, WARN, ERR, CRITICAL, ASR_DIR, WHISPER_CPP_MODELS, GARBAGE_COLLECTION_INTERVAL, GARBAGE_TTL, WHISPER_CPP_DIR, MAX_CPU_CORES
 from sijapi import DEBUG, INFO, WARN, ERR, CRITICAL, ASR_DIR, WHISPER_CPP_MODELS, GARBAGE_COLLECTION_INTERVAL, GARBAGE_TTL, WHISPER_CPP_DIR, MAX_CPU_CORES
 asr = APIRouter()
 class TranscribeParams(BaseModel):
    model: str = Field(default="small")
-    output_srt : Optional[bool] = Field(default=False)
+    output_srt: Optional[bool] = Field(default=False)
-    language : Optional[str] = Field(None)
+    language: Optional[str] = Field(None)
-    split_on_word : Optional[bool] = Field(default=False)
+    split_on_word: Optional[bool] = Field(default=False)
-    temperature : Optional[float] = Field(default=0)
+    temperature: Optional[float] = Field(default=0)
-    temp_increment : Optional[int] = Field(None)
+    temp_increment: Optional[int] = Field(None)
-    translate : Optional[bool] = Field(default=False)
+    translate: Optional[bool] = Field(default=False)
-    diarize : Optional[bool] = Field(default=False)
+    diarize: Optional[bool] = Field(default=False)
-    tiny_diarize : Optional[bool] = Field(default=False)
+    tiny_diarize: Optional[bool] = Field(default=False)
-    no_fallback : Optional[bool] = Field(default=False)
+    no_fallback: Optional[bool] = Field(default=False)
-    output_json : Optional[bool] = Field(default=False)
+    output_json: Optional[bool] = Field(default=False)
-    detect_language : Optional[bool] = Field(default=False)
+    detect_language: Optional[bool] = Field(default=False)
-    dtw : Optional[str] = Field(None)
+    dtw: Optional[str] = Field(None)
-    threads : Optional[int] = Field(None)
+    threads: Optional[int] = Field(None)
-from urllib.parse import unquote
+# Global dictionary to store transcription results
-import json
+transcription_results = {}
@asr.post("/asr")
@asr.post("/transcribe")
@asr.post("/v1/audio/transcription")
 async def transcribe_endpoint(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    params: str = Form(...)
 ):
    try:
        # Decode the URL-encoded string
        decoded_params = unquote(params)
        # Parse the JSON string
        parameters_dict = json.loads(decoded_params)
        # Create TranscribeParams object
        parameters = TranscribeParams(**parameters_dict)
    except json.JSONDecodeError as json_err:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {str(json_err)}")
@ -72,12 +55,30 @@ async def transcribe_endpoint(
        temp_file.write(await file.read())
        temp_file_path = temp_file.name
-    transcription = await transcribe_audio(file_path=temp_file_path, params=parameters)
+    transcription_job = await transcribe_audio(file_path=temp_file_path, params=parameters, background_tasks=background_tasks)
-    return transcription
+    job_id = transcription_job["job_id"]
-async def transcribe_audio(file_path, params: TranscribeParams):
+    # Poll for completion
    max_wait_time = 600  # 10 minutes
    poll_interval = 2  # 2 seconds
    elapsed_time = 0
-    file_path = convert_to_wav(file_path)
+    while elapsed_time < max_wait_time:
        if job_id in transcription_results:
            result = transcription_results[job_id]
            if result["status"] == "completed":
                return JSONResponse(content={"status": "completed", "result": result["result"]})
            elif result["status"] == "failed":
                return JSONResponse(content={"status": "failed", "error": result["error"]}, status_code=500)
        await asyncio.sleep(poll_interval)
        elapsed_time += poll_interval
    # If we've reached this point, the transcription has taken too long
    return JSONResponse(content={"status": "timeout", "message": "Transcription is taking longer than expected. Please check back later."}, status_code=202)
 async def transcribe_audio(file_path, params: TranscribeParams, background_tasks: BackgroundTasks):
    file_path = await convert_to_wav(file_path)
    model = params.model if params.model in WHISPER_CPP_MODELS else 'small' 
    model_path = WHISPER_CPP_DIR / 'models' / f'ggml-{model}.bin'
    command = [str(WHISPER_CPP_DIR / 'build' / 'bin' / 'main')]
@ -115,35 +116,50 @@ async def transcribe_audio(file_path, params: TranscribeParams):
    command.extend(['-f', file_path])
    DEBUG(f"Command: {command}")
    # Create a unique ID for this transcription job
    job_id = str(uuid.uuid4())
    # Store the job status
    transcription_results[job_id] = {"status": "processing", "result": None}
    # Run the transcription in a background task
    background_tasks.add_task(process_transcription, command, file_path, job_id)
    return {"job_id": job_id}
 async def process_transcription(command, file_path, job_id):
    try:
        result = await run_transcription(command, file_path)
        transcription_results[job_id] = {"status": "completed", "result": result}
    except Exception as e:
        transcription_results[job_id] = {"status": "failed", "error": str(e)}
    finally:
        # Clean up the temporary file
        os.remove(file_path)
 async def run_transcription(command, file_path):
    proc = await asyncio.create_subprocess_exec(
        *command,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE
    )
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        raise Exception(f"Error running command: {stderr.decode()}")
    return stdout.decode().strip()
-    result = stdout.decode().strip()
+async def convert_to_wav(file_path: str):
    DEBUG(f"Result: {result}")
    return result
 def convert_to_wav(file_path: str):
    wav_file_path = os.path.join(ASR_DIR, f"{uuid.uuid4()}.wav")
-    subprocess.run(["ffmpeg", "-y", "-i", file_path, "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", wav_file_path], check=True)
+    proc = await asyncio.create_subprocess_exec(
        "ffmpeg", "-y", "-i", file_path, "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", wav_file_path,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE
    )
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        raise Exception(f"Error converting file to WAV: {stderr.decode()}")
    return wav_file_path
 def download_from_youtube(url: str):
    temp_file = os.path.join(ASR_DIR, f"{uuid.uuid4()}.mp3")
    ytdl_opts = {
        'outtmpl': temp_file,
        'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3', 'preferredquality': '192'}],
        'nooverwrites': True
    }
    with YoutubeDL(ytdl_opts) as ydl:
        ydl.download([url])
    return convert_to_wav(temp_file)
 def format_srt_timestamp(seconds: float):
    milliseconds = round(seconds * 1000.0)
--- a/sijapi/routers/email.py
+++ b/sijapi/routers/email.py
@ -15,21 +15,16 @@ from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
 from email.mime.image import MIMEImage
 import ssl
 from datetime import datetime as dt_datetime
 from pydantic import BaseModel
 from typing import List, Optional, Any
 import yaml
-from typing import List, Dict, Optional
+from typing import List, Dict, Optional, Set
 from pydantic import BaseModel
 from sijapi import DEBUG, ERR, LLM_SYS_MSG
 from datetime import datetime as dt_datetime
 from typing import Dict
 from sijapi import DEBUG, INFO, WARN, ERR, CRITICAL
-from sijapi import PODCAST_DIR, DEFAULT_VOICE, EMAIL_CONFIG
+from sijapi import PODCAST_DIR, DEFAULT_VOICE, EMAIL_CONFIG, EMAIL_LOGS
 from sijapi.routers import tts, llm, sd, locate
 from sijapi.utilities import clean_text, assemble_journal_path, extract_text, prefix_lines
 from sijapi.classes import EmailAccount, IMAPConfig, SMTPConfig, IncomingEmail, EmailContact, AutoResponder
-
+from sijapi import DEBUG, INFO, ERR, LOGS_DIR
 from sijapi.classes import EmailAccount
 email = APIRouter(tags=["private"])
@ -76,8 +71,6 @@ def get_smtp_connection(account: EmailAccount):
        return SMTP(account.smtp.host, account.smtp.port)
 def get_matching_autoresponders(this_email: IncomingEmail, account: EmailAccount) -> List[AutoResponder]:
    def matches_list(item: str, this_email: IncomingEmail) -> bool:
        if '@' in item:
@ -161,50 +154,6 @@ async def extract_attachments(attachments) -> List[str]:
    return attachment_texts
 async def process_account(account: EmailAccount):
    while True:
        start_time = dt_datetime.now()
        try:
            DEBUG(f"Connecting to {account.name} to check for unread emails...")
            with get_imap_connection(account) as inbox:
                DEBUG(f"Connected to {account.name}, checking for unread emails now...")
                unread_messages = inbox.messages(unread=True)
                for uid, message in unread_messages:
                    recipients = [EmailContact(email=recipient['email'], name=recipient.get('name', '')) for recipient in message.sent_to]
                    localized_datetime = await locate.localize_datetime(message.date)
                    this_email = IncomingEmail(
                        sender=message.sent_from[0]['email'],
                        datetime_received=localized_datetime,
                        recipients=recipients,
                        subject=message.subject,
                        body=clean_email_content(message.body['html'][0]) if message.body['html'] else clean_email_content(message.body['plain'][0]) or "",
                        attachments=message.attachments
                    )
                    DEBUG(f"\n\nProcessing email for account {account.name}: {this_email.subject}\n\n")
                    save_success = await save_email(this_email, account)
                    respond_success = await autorespond(this_email, account)
                    if save_success and respond_success:
                        inbox.mark_seen(uid)
        except Exception as e:
            ERR(f"An error occurred for account {account.name}: {e}")
        # Calculate the time taken for processing
        processing_time = (dt_datetime.now() - start_time).total_seconds()
        # Calculate the remaining time to wait
        wait_time = max(0, account.refresh - processing_time)
        # Wait for the remaining time
        await asyncio.sleep(wait_time)
 async def process_all_accounts():
    email_accounts = load_email_accounts(EMAIL_CONFIG)
    tasks = [asyncio.create_task(process_account(account)) for account in email_accounts]
    await asyncio.gather(*tasks)
 async def save_email(this_email: IncomingEmail, account: EmailAccount):
    try:
        md_path, md_relative = assemble_journal_path(this_email.datetime_received, "Emails", this_email.subject, ".md")
@ -262,6 +211,7 @@ tags:
 async def autorespond(this_email: IncomingEmail, account: EmailAccount):
    matching_profiles = get_matching_autoresponders(this_email, account)
    DEBUG(f"Matching profiles: {matching_profiles}")
    for profile in matching_profiles:
        DEBUG(f"Auto-responding to {this_email.subject} with profile: {profile.name}")
        auto_response_subject = f"Auto-Response Re: {this_email.subject}"
@ -297,6 +247,83 @@ async def send_auto_response(to_email, subject, body, profile, account):
        return False
 async def load_processed_uids(filename: Path) -> Set[str]:
    if filename.exists():
        with open(filename, 'r') as f:
            return set(line.strip().split(':')[-1] for line in f)
    return set()
 async def save_processed_uid(filename: Path, account_name: str, uid: str):
    with open(filename, 'a') as f:
        f.write(f"{account_name}:{uid}\n")
 async def process_account_summarization(account: EmailAccount):
    summarized_log = EMAIL_LOGS / "summarized.txt"
    while True:
        try:
            processed_uids = await load_processed_uids(summarized_log)
            with get_imap_connection(account) as inbox:
                unread_messages = inbox.messages(unread=True)
                for uid, message in unread_messages:
                    uid_str = uid.decode() if isinstance(uid, bytes) else str(uid)
                    if uid_str not in processed_uids:
                        recipients = [EmailContact(email=recipient['email'], name=recipient.get('name', '')) for recipient in message.sent_to]
                        localized_datetime = await locate.localize_datetime(message.date)
                        this_email = IncomingEmail(
                            sender=message.sent_from[0]['email'],
                            datetime_received=localized_datetime,
                            recipients=recipients,
                            subject=message.subject,
                            body=clean_email_content(message.body['html'][0]) if message.body['html'] else clean_email_content(message.body['plain'][0]) or "",
                            attachments=message.attachments
                        )
                        if account.summarize:
                            save_success = await save_email(this_email, account)
                            if save_success:
                                await save_processed_uid(summarized_log, account.name, uid_str)
                                DEBUG(f"Summarized email: {uid_str}")
        except Exception as e:
            ERR(f"An error occurred during summarization for account {account.name}: {e}")
        await asyncio.sleep(account.refresh)
 async def process_account_autoresponding(account: EmailAccount):
    autoresponded_log = EMAIL_LOGS / "autoresponded.txt"
    while True:
        try:
            processed_uids = await load_processed_uids(autoresponded_log)
            with get_imap_connection(account) as inbox:
                unread_messages = inbox.messages(unread=True)
                for uid, message in unread_messages:
                    uid_str = uid.decode() if isinstance(uid, bytes) else str(uid)
                    if uid_str not in processed_uids:
                        recipients = [EmailContact(email=recipient['email'], name=recipient.get('name', '')) for recipient in message.sent_to]
                        localized_datetime = await locate.localize_datetime(message.date)
                        this_email = IncomingEmail(
                            sender=message.sent_from[0]['email'],
                            datetime_received=localized_datetime,
                            recipients=recipients,
                            subject=message.subject,
                            body=clean_email_content(message.body['html'][0]) if message.body['html'] else clean_email_content(message.body['plain'][0]) or "",
                            attachments=message.attachments
                        )
                        respond_success = await autorespond(this_email, account)
                        if respond_success:
                            await save_processed_uid(autoresponded_log, account.name, uid_str)
                            DEBUG(f"Auto-responded to email: {uid_str}")
        except Exception as e:
            ERR(f"An error occurred during auto-responding for account {account.name}: {e}")
        await asyncio.sleep(account.refresh)
 async def process_all_accounts():
    email_accounts = load_email_accounts(EMAIL_CONFIG)
    summarization_tasks = [asyncio.create_task(process_account_summarization(account)) for account in email_accounts]
    autoresponding_tasks = [asyncio.create_task(process_account_autoresponding(account)) for account in email_accounts]
    await asyncio.gather(*summarization_tasks, *autoresponding_tasks)
@email.on_event("startup")
 async def startup_event():
    asyncio.create_task(process_all_accounts())