diff --git a/missing-packages.txt b/missing-packages.txt
new file mode 100644
index 0000000..8c0dda4
--- /dev/null
+++ b/missing-packages.txt
@@ -0,0 +1,11 @@
+__future__
+cv2
+dateparser_data
+ionic
+json # Used for working with JSON data
+llama_cpp
+markdown_it
+phonenumber_field
+pptx
+requests # Used for making HTTP requests
+sijapi
diff --git a/requirements.txt b/requirements.txt
index 1a93536..a027aae 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -49,6 +49,83 @@ readability
 trafilatura
 urllib3
 anyio
-semaphore
 location
-SRTM.py
\ No newline at end of file
+SRTM.py
+better_profanity
+EventKit
+Foundation
+aiosqlite
+anthropic
+apscheduler
+asgiref
+aura_sr
+authlib
+backports.zoneinfo
+boto3
+click
+colorama
+contextvars
+cron_descriptor
+dateparser
+deprecated
+django
+django_apscheduler
+exa_py
+factory
+faker
+faster_whisper
+ffmpeg
+fire
+flupy
+freezegun
+google
+huggingface_hub
+jinja2
+khoj
+konlpy
+langchain
+langchain_community
+libtmux
+litellm
+llama_index
+lxml
+magika
+moviepy
+neo4j
+nest_asyncio
+nltk
+numpy
+openpyxl
+osmium
+packaging
+pgvector
+posthog
+psutil
+psycopg2
+pypdf
+pytest
+r2r
+redis
+resend
+rich
+schedule
+semaphore
+sentence_transformers
+soundfile
+spacy
+sqlalchemy
+stripe
+tailscale
+tenacity
+tiktoken
+torchaudio
+transformers
+twilio
+typing_extensions
+uvicorn
+vecs
+vectordb
+websockets
+whisper
+whisperplus
+youtube_dl
diff --git a/sijapi/__init__.py b/sijapi/__init__.py
index 981296a..093026b 100644
--- a/sijapi/__init__.py
+++ b/sijapi/__init__.py
@@ -27,6 +27,7 @@ SUBNET_BROADCAST = os.getenv("SUBNET_BROADCAST", '10.255.255.255')
 MAX_CPU_CORES = min(int(os.getenv("MAX_CPU_CORES", int(multiprocessing.cpu_count()/2))), multiprocessing.cpu_count())
 IMG = Configuration.load('img', 'secrets')
 News = Configuration.load('news', 'secrets')
+Archivist = Configuration.load('archivist')
 Scrape = Configuration.load('scrape', 'secrets', Dir)
 Serve = Configuration.load('serve')
 
diff --git a/sijapi/routers/news.py b/sijapi/routers/news.py
index 635cb8c..3da80c0 100644
--- a/sijapi/routers/news.py
+++ b/sijapi/routers/news.py
@@ -11,7 +11,7 @@ import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 from datetime import datetime as dt_datetime, timedelta
-from typing import Optional
+from typing import Optional, List, Tuple
 import aiohttp
 import aiofiles
 import newspaper
@@ -19,12 +19,13 @@ import trafilatura
 from newspaper import Article
 from readability import Document
 from markdownify import markdownify as md
+from better_profanity import profanity
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 from fastapi import APIRouter, BackgroundTasks, UploadFile, Form, HTTPException, Query, Path as FastAPIPath
 from pathlib import Path
-from sijapi import L, News, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
-from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path
+from sijapi import L, News, Archivist, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
+from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity
 from sijapi.routers import gis, llm, tts, note
 
 news = APIRouter()
@@ -329,12 +330,13 @@ async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
     return markdown_content
 
 
+
 async def process_archive(
     url: str,
     title: Optional[str] = None,
     encoding: str = 'utf-8',
     source: Optional[str] = None,
-) -> Path:
+) -> Optional[Path]:
     timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
     readable_title = title if title else f"{url} - {timestamp}"
 
@@ -342,16 +344,24 @@ async def process_archive(
     if content is None:
         raise HTTPException(status_code=400, detail="Failed to convert content to markdown")
 
-    markdown_path, relative_path = assemble_archive_path(readable_title, ".md")
+    if contains_profanity(url, content, 0.2, Archivist.blacklist):
+        info(f"Not archiving {url} due to profanity")
+        return None
+
+    try:
+        markdown_path, relative_path = assemble_archive_path(filename=readable_title, extension=".md")
+    except Exception as e:
+        warn(f"Failed to assemble archive path for {url}: {str(e)}")
+        return None
 
     markdown_content = f"---\n"
-    markdown_content += f"title: {readable_title}\n"
+    markdown_content += f"title: \"{readable_title}\"\n"
     markdown_content += f"added: {timestamp}\n"
-    markdown_content += f"url: {url}"
-    markdown_content += f"date: {dt_datetime.now().strftime('%Y-%m-%d')}"
+    markdown_content += f"url: \"{url}\"\n"
+    markdown_content += f"date: \"{dt_datetime.now().strftime('%Y-%m-%d')}\"\n"
     markdown_content += f"---\n\n"
     markdown_content += f"# {readable_title}\n\n"
-    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}"
+    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}\n\n"
     markdown_content += content
 
     try:
@@ -365,6 +375,7 @@ async def process_archive(
         return None
 
 
+
 def download_file(url, folder):
     os.makedirs(folder, exist_ok=True)
     filename = str(uuid.uuid4()) + os.path.splitext(urlparse(url).path)[-1]
diff --git a/sijapi/utilities.py b/sijapi/utilities.py
index 64bfd79..b2642a8 100644
--- a/sijapi/utilities.py
+++ b/sijapi/utilities.py
@@ -12,11 +12,13 @@ from dateutil import parser
 from pathlib import Path
 import filetype
 from PyPDF2 import PdfReader
+import hashlib
+from better_profanity import profanity
 from pdfminer.high_level import extract_text as pdfminer_extract_text
 import pytesseract
 from pdf2image import convert_from_path
 from datetime import datetime, date, time
-from typing import Optional, Union, Tuple
+from typing import Optional, Union, Tuple, List
 import asyncio
 from PIL import Image
 import pandas as pd
@@ -28,7 +30,7 @@ from sshtunnel import SSHTunnelForwarder
 from fastapi import Depends, HTTPException, Request, UploadFile
 from fastapi.security.api_key import APIKeyHeader
 
-from sijapi import L, API, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_PATH_LENGTH, ARCHIVE_DIR
+from sijapi import L, API, Archivist, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_PATH_LENGTH, ARCHIVE_DIR
 
 logger = L.get_module_logger('utilities')
 def debug(text: str): logger.debug(text)
@@ -63,35 +65,58 @@ def validate_api_key(request: Request, api_key: str = Depends(api_key_header)):
         raise HTTPException(status_code=401, detail="Invalid or missing API key")
 
 
-def assemble_archive_path(filename: str, extension: str = ".md", date_time: datetime = datetime.now(), subdir: str = None) -> Tuple[Path, Path]:
+def assemble_archive_path(filename: str, extension: str = None, date_time: datetime = datetime.now(), subdir: str = None) -> Tuple[Path, Path]:
     year = date_time.strftime(YEAR_FMT)
     month = date_time.strftime(MONTH_FMT)
     day = date_time.strftime(DAY_FMT)
     day_short = date_time.strftime(DAY_SHORT_FMT)
     timestamp = date_time.strftime("%H%M%S")
-
-    # Ensure the extension is preserved
-    base_name, ext = os.path.splitext(filename)
-    extension = ext if ext else extension
-
+
+    # Handle extension priority
+    base_name, original_ext = os.path.splitext(filename)
+
+    if extension is not None:
+        # Use the provided extension parameter
+        final_extension = extension if extension.startswith('.') else f'.{extension}'
+    elif original_ext:
+        # Use the original file extension if present
+        final_extension = original_ext
+    else:
+        # Default to ".md" if no extension is provided or present
+        final_extension = ".md"
+
     # Initial sanitization
     sanitized_base = sanitize_filename(base_name, '')
-    filename = f"{day_short} {timestamp} {sanitized_base}{extension}"
-
+    filename = f"{day_short} {timestamp} {sanitized_base}{final_extension}"
+
     relative_path = Path(year) / month / day / filename
-    absolute_path = ARCHIVE_DIR / relative_path
-
+    absolute_path = Archivist.dir / relative_path
+
     # Ensure the total path length doesn't exceed MAX_PATH_LENGTH
-    while len(str(absolute_path)) > MAX_PATH_LENGTH:
-        # Truncate the sanitized_base, not the full filename
+    while len(str(absolute_path)) > MAX_PATH_LENGTH and len(sanitized_base) > 0:
         sanitized_base = sanitized_base[:-1]
-        filename = f"{day_short} {timestamp} {sanitized_base}{extension}"
+        filename = f"{day_short} {timestamp} {sanitized_base}{final_extension}"
         relative_path = Path(year) / month / day / filename
-        absolute_path = ARCHIVE_DIR / relative_path
-
+        absolute_path = Archivist.dir / relative_path
+
+    # If we've exhausted sanitized_base and the path is still too long
+    if len(str(absolute_path)) > MAX_PATH_LENGTH:
+        # Use a hash of the original filename to ensure uniqueness
+        hash_suffix = hashlib.md5(base_name.encode()).hexdigest()[:8]
+        filename = f"{day_short} {timestamp} {hash_suffix}{final_extension}"
+        relative_path = Path(year) / month / day / filename
+        absolute_path = Archivist.dir / relative_path
+
+    # Final check and truncation if necessary
+    if len(str(absolute_path)) > MAX_PATH_LENGTH:
+        overflow = len(str(absolute_path)) - MAX_PATH_LENGTH
+        absolute_path = Path(str(absolute_path)[:-overflow])
+        relative_path = Path(str(relative_path)[:-overflow])
+
     return absolute_path, relative_path
 
 
+
 def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str = None, extension: str = None, no_timestamp: bool = False) -> Tuple[Path, Path]:
     '''
     Obsidian helper. Takes a datetime and optional subdirectory name, filename, and extension.
@@ -160,6 +185,22 @@ def f(file):
     with open(file_path, 'rb') as thefile:
         return thefile
 
+
+def contains_profanity(url: str, content: str, threshold: float = 0.2, custom_words: Optional[List[str]] = None) -> bool:
+    custom_words = custom_words or []
+    if any(word.lower() in url.lower() for word in custom_words):
+        info(f"Blacklisted word in {url}")
+        return True
+
+    # Check content for profanity, combining the default wordlist with the custom blacklist
+    profanity.load_censor_words()
+    profanity.add_censor_words(custom_words)
+    word_list = content.split()
+    content_profanity_count = sum(1 for word in word_list if profanity.contains_profanity(word))
+    content_profanity_ratio = content_profanity_count / len(word_list) if word_list else 0
+    debug(f"Profanity ratio for {url}: {content_profanity_ratio}")
+    return content_profanity_ratio >= threshold
+
 
 def get_extension(file):
     try:
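
A few notes on the behavior these hunks introduce. First, the front-matter changes in `process_archive`: the old code omitted the `\n` after the `url:` and `date:` lines (fusing them into one) and left values unquoted, so any title containing a colon produced invalid YAML. A minimal illustration of the quoting issue, assuming PyYAML is available to parse the front matter (sijapi itself may use a different parser):

```python
import yaml

broken = 'title: Example: a subtitle'    # unquoted colon in the value
fixed = 'title: "Example: a subtitle"'   # quoted, as the patched code now emits

try:
    yaml.safe_load(broken)
except yaml.YAMLError as e:
    print(f"unquoted value fails to parse: {type(e).__name__}")

print(yaml.safe_load(fixed))  # {'title': 'Example: a subtitle'}
```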
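
Second, the profanity gate. `contains_profanity` short-circuits when a blacklisted word appears in the URL, then scans the content word by word and compares the flagged fraction against the threshold (0.2 in `process_archive`, i.e. a page is skipped when at least 20% of its words are flagged). Below is a standalone sketch of the ratio logic; `profanity_ratio` is an illustrative helper, not a sijapi function, and it assumes only `better_profanity` is installed (the sijapi logging and URL check are dropped):

```python
from typing import List, Optional
from better_profanity import profanity

def profanity_ratio(content: str, custom_words: Optional[List[str]] = None) -> float:
    """Fraction of whitespace-separated words flagged by better_profanity."""
    profanity.load_censor_words()                   # default wordlist
    profanity.add_censor_words(custom_words or [])  # extend with the blacklist
    words = content.split()
    if not words:
        return 0.0
    flagged = sum(1 for w in words if profanity.contains_profanity(w))
    return flagged / len(words)

print(profanity_ratio("perfectly ordinary archival text") >= 0.2)  # False
```

Calling `profanity.contains_profanity` once per word keeps the per-word ratio the patch computes, at the cost of one censor pass per token; a single pass over the whole text would be cheaper but would only yield a boolean.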
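
Finally, the path-length handling in `assemble_archive_path` now degrades in three stages: trim the sanitized title one character at a time, fall back to an 8-character MD5 digest of the original name, and as a last resort hard-truncate the whole path. A condensed standalone sketch of that cascade; `capped_archive_path` is an illustrative helper with an artificially small `MAX_PATH_LENGTH`, while the real function builds the dated `year/month/day` tree under `Archivist.dir` and returns both absolute and relative paths:

```python
import hashlib
from pathlib import Path

MAX_PATH_LENGTH = 60  # illustrative only; sijapi defines its own limit

def capped_archive_path(root: str, dated_dir: str, stamp: str, title: str, ext: str = ".md") -> Path:
    def build(base: str) -> Path:
        return Path(root) / dated_dir / f"{stamp} {base}{ext}"

    # Stage 1: shave characters off the title until the full path fits.
    trimmed = title
    path = build(trimmed)
    while len(str(path)) > MAX_PATH_LENGTH and trimmed:
        trimmed = trimmed[:-1]
        path = build(trimmed)

    # Stage 2: no title left and still too long -> short, stable hash of the original.
    if len(str(path)) > MAX_PATH_LENGTH:
        path = build(hashlib.md5(title.encode()).hexdigest()[:8])

    # Stage 3: hard truncation; like the patch's final check, this can clip the extension.
    return Path(str(path)[:MAX_PATH_LENGTH])

print(capped_archive_path("/archive", "2024/Jun/01", "240601 120000", "a very long scraped article title"))
```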