Auto-update: Mon Aug 5 14:32:28 PDT 2024

2024-08-05 14:32:28 -07:00 · 2024-08-05 14:32:28 -07:00 · ec21f92242
commit ec21f92242
parent 27429df815
5 changed files with 166 additions and 27 deletions
--- a/missing-packages.txt
+++ b/missing-packages.txt
@@ -0,0 +1,11 @@
+__future__
+cv2
+dateparser_data
+ionic
+json  # Used for working with JSON data
+llama_cpp
+markdown_it
+phonenumber_field
+pptx
+requests  # Used for making HTTP requests
+sijapi
--- a/requirements.txt
+++ b/requirements.txt
@@ -49,6 +49,83 @@ readability
 trafilatura
 urllib3
 anyio
-semaphore
 location
 SRTM.py
+better_profanity
+EventKit
+Foundation
+aiosqlite
+anthropic
+apscheduler
+asgiref
+aura_sr
+authlib
+backports.zoneinfo
+boto3
+click
+colorama
+contextvars
+cron_descriptor
+dateparser
+deprecated
+django
+django_apscheduler
+exa_py
+factory
+faker
+faster_whisper
+ffmpeg
+fire
+flupy
+freezegun
+google
+huggingface_hub
+jinja2
+khoj
+konlpy
+langchain
+langchain_community
+libtmux
+litellm
+llama_index
+lxml
+magika
+moviepy
+neo4j
+nest_asyncio
+nltk
+numpy
+openpyxl
+osmium
+packaging
+pgvector
+posthog
+psutil
+psycopg2
+pypdf
+pytest
+r2r
+redis
+resend
+rich
+schedule
+semaphore
+sentence_transformers
+soundfile
+spacy
+sqlalchemy
+stripe
+tailscale
+tenacity
+tiktoken
+torchaudio
+transformers
+twilio
+typing_extensions
+uvicorn
+vecs
+vectordb
+websockets
+whisper
+whisperplus
+youtube_dl
--- a/sijapi/__init__.py
+++ b/sijapi/__init__.py
@@ -27,6 +27,7 @@ SUBNET_BROADCAST = os.getenv("SUBNET_BROADCAST", '10.255.255.255')
 MAX_CPU_CORES = min(int(os.getenv("MAX_CPU_CORES", int(multiprocessing.cpu_count()/2))), multiprocessing.cpu_count())
 IMG = Configuration.load('img', 'secrets')
 News = Configuration.load('news', 'secrets')
+Archivist = Configuration.load('archivist')
 Scrape = Configuration.load('scrape', 'secrets', Dir)
 Serve = Configuration.load('serve')

--- a/sijapi/routers/news.py
+++ b/sijapi/routers/news.py
@@ -11,7 +11,7 @@ import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 from datetime import datetime as dt_datetime, timedelta
-from typing import Optional
+from typing import Optional, List, Tuple
 import aiohttp
 import aiofiles
 import newspaper
@@ -19,12 +19,13 @@ import trafilatura
 from newspaper import Article
 from readability import Document
 from markdownify import markdownify as md
+from better_profanity import profanity
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 from fastapi import APIRouter, BackgroundTasks, UploadFile, Form, HTTPException, Query, Path as FastAPIPath
 from pathlib import Path
-from sijapi import L, News, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
-from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path
+from sijapi import L, News, Archivist, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
+from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity
 from sijapi.routers import gis, llm, tts, note

 news = APIRouter()
@@ -329,12 +330,13 @@ async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]
    return markdown_content


+
 async def process_archive(
    url: str,
    title: Optional[str] = None,
    encoding: str = 'utf-8',
    source: Optional[str] = None,
-) -> Path:
+) -> Optional[Path]:
    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
    readable_title = title if title else f"{url} - {timestamp}"
    
@@ -342,16 +344,24 @@ async def process_archive(
    if content is None:
        raise HTTPException(status_code=400, detail="Failed to convert content to markdown")
    
-    markdown_path, relative_path = assemble_archive_path(readable_title, ".md")
+    if contains_profanity(url, content, 0.2, Archivist.blacklist):
+        info(f"Not archiving {url} due to profanity")
+        return None
+    
+    try:
+        markdown_path, relative_path = assemble_archive_path(filename=readable_title, extension=".md")
+    except Exception as e:
+        warn(f"Failed to assemble archive path for {url}: {str(e)}")
+        return None
    
    markdown_content = f"---\n"
-    markdown_content += f"title: {readable_title}\n"
+    markdown_content += f"title: \"{readable_title}\"\n"
    markdown_content += f"added: {timestamp}\n"
-    markdown_content += f"url: {url}"
-    markdown_content += f"date: {dt_datetime.now().strftime('%Y-%m-%d')}"
+    markdown_content += f"url: \"{url}\"\n"
+    markdown_content += f"date: \"{dt_datetime.now().strftime('%Y-%m-%d')}\"\n"
    markdown_content += f"---\n\n"
    markdown_content += f"# {readable_title}\n\n"
-    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}"
+    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}\n\n"
    markdown_content += content
    
    try:
@@ -365,6 +375,7 @@ async def process_archive(
        return None


+
 def download_file(url, folder):
    os.makedirs(folder, exist_ok=True)
    filename = str(uuid.uuid4()) + os.path.splitext(urlparse(url).path)[-1]
--- a/sijapi/utilities.py
+++ b/sijapi/utilities.py
@@ -12,11 +12,12 @@ from dateutil import parser
 from pathlib import Path
 import filetype
 from PyPDF2 import PdfReader
+from better_profanity import profanity
 from pdfminer.high_level import extract_text as pdfminer_extract_text
 import pytesseract
 from pdf2image import convert_from_path
 from datetime import datetime, date, time
-from typing import Optional, Union, Tuple
+from typing import Optional, Union, Tuple, List
 import asyncio
 from PIL import Image
 import pandas as pd
@@ -28,7 +29,7 @@ from sshtunnel import SSHTunnelForwarder
 from fastapi import Depends, HTTPException, Request, UploadFile
 from fastapi.security.api_key import APIKeyHeader

-from sijapi import L, API, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_PATH_LENGTH, ARCHIVE_DIR
+from sijapi import L, API, Archivist, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_PATH_LENGTH, ARCHIVE_DIR

 logger = L.get_module_logger('utilities')
 def debug(text: str): logger.debug(text)
@@ -63,35 +64,58 @@ def validate_api_key(request: Request, api_key: str = Depends(api_key_header)):
    raise HTTPException(status_code=401, detail="Invalid or missing API key")


-def assemble_archive_path(filename: str, extension: str = ".md", date_time: datetime = datetime.now(), subdir: str = None) -> Tuple[Path, Path]:
+def assemble_archive_path(filename: str, extension: str = None, date_time: datetime = datetime.now(), subdir: str = None) -> Tuple[Path, Path]:
    year = date_time.strftime(YEAR_FMT)
    month = date_time.strftime(MONTH_FMT)
    day = date_time.strftime(DAY_FMT)
    day_short = date_time.strftime(DAY_SHORT_FMT)
    timestamp = date_time.strftime("%H%M%S")
    
-    # Ensure the extension is preserved
-    base_name, ext = os.path.splitext(filename)
-    extension = ext if ext else extension
+    # Handle extension priority
+    base_name, original_ext = os.path.splitext(filename)
+    
+    if extension is not None:
+        # Use the provided extension parameter
+        final_extension = extension if extension.startswith('.') else f'.{extension}'
+    elif original_ext:
+        # Use the original file extension if present
+        final_extension = original_ext
+    else:
+        # Default to ".md" if no extension is provided or present
+        final_extension = ".md"
    
    # Initial sanitization
    sanitized_base = sanitize_filename(base_name, '')
-    filename = f"{day_short} {timestamp} {sanitized_base}{extension}"
+    filename = f"{day_short} {timestamp} {sanitized_base}{final_extension}"
    
    relative_path = Path(year) / month / day / filename
-    absolute_path = ARCHIVE_DIR / relative_path
+    absolute_path = Archivist.dir / relative_path
    
    # Ensure the total path length doesn't exceed MAX_PATH_LENGTH
-    while len(str(absolute_path)) > MAX_PATH_LENGTH:
-        # Truncate the sanitized_base, not the full filename
+    while len(str(absolute_path)) > MAX_PATH_LENGTH and len(sanitized_base) > 0:
        sanitized_base = sanitized_base[:-1]
-        filename = f"{day_short} {timestamp} {sanitized_base}{extension}"
+        filename = f"{day_short} {timestamp} {sanitized_base}{final_extension}"
        relative_path = Path(year) / month / day / filename
        absolute_path = ARCHIVE_DIR / relative_path
    
+    # If we've exhausted sanitized_base and the path is still too long
+    if len(str(absolute_path)) > MAX_PATH_LENGTH:
+        # Use a hash of the original filename to ensure uniqueness
+        hash_suffix = hashlib.md5(base_name.encode()).hexdigest()[:8]
+        filename = f"{day_short} {timestamp} {hash_suffix}{final_extension}"
+        relative_path = Path(year) / month / day / filename
+        absolute_path = ARCHIVE_DIR / relative_path
+    
+    # Final check and truncation if necessary
+    if len(str(absolute_path)) > MAX_PATH_LENGTH:
+        overflow = len(str(absolute_path)) - MAX_PATH_LENGTH
+        absolute_path = Path(str(absolute_path)[:-overflow])
+        relative_path = Path(str(relative_path)[:-overflow])
+    
    return absolute_path, relative_path


+
 def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str = None, extension: str = None, no_timestamp: bool = False) -> Tuple[Path, Path]:
    '''
    Obsidian helper. Takes a datetime and optional subdirectory name, filename, and extension.
@@ -161,6 +185,21 @@ def f(file):
        return thefile

            
+def contains_profanity(url: str, content: str, threshold: float = 0.2, custom_words: Optional[List[str]] = None) -> bool:
+    custom_words = custom_words or []
+    if any(word.lower() in url.lower() for word in custom_words):
+        info(f"Blacklisted word in {url}")
+        return True
+
+    # Check content for profanity
+    profanity.load_censor_words(custom_words)
+    word_list = content.split()
+    content_profanity_count = sum(1 for word in word_list if profanity.contains_profanity(word))
+    content_profanity_ratio = content_profanity_count / len(word_list) if word_list else 0
+    debug(f"Profanity ratio for {url}: {content_profanity_ratio}")
+    return content_profanity_ratio >= threshold
+
+
 def get_extension(file):
    try:
        if isinstance(file, str):