Auto-update: Mon Aug 5 14:32:28 PDT 2024

Author: sanj
Date: 2024-08-05 14:32:28 -07:00
parent 27429df815
commit ec21f92242
5 changed files with 166 additions and 27 deletions

missing-packages.txt (new file, 11 lines)

@@ -0,0 +1,11 @@
+__future__
+cv2
+dateparser_data
+ionic
+json # Used for working with JSON data
+llama_cpp
+markdown_it
+phonenumber_field
+pptx
+requests # Used for making HTTP requests
+sijapi

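The file mixes top-level modules and subpackages, one name per line with optional trailing comments. As a minimal sketch (not part of the commit), such a list can be re-checked against the current environment with importlib; the file path and the "name [# comment]" line format are assumptions based on the file above:

    import importlib.util
    from pathlib import Path

    # Hypothetical helper: report which names from missing-packages.txt
    # still fail to resolve in the current environment.
    def unimportable(path: str = "missing-packages.txt") -> list[str]:
        missing = []
        for line in Path(path).read_text().splitlines():
            name = line.split("#", 1)[0].strip()  # drop trailing comments
            if name and importlib.util.find_spec(name) is None:
                missing.append(name)
        return missing

    print(unimportable())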
================================================================

@@ -49,6 +49,83 @@ readability
 trafilatura
 urllib3
 anyio
+semaphore
 location
 SRTM.py
+better_profanity
+EventKit
+Foundation
+aiosqlite
+anthropic
+apscheduler
+asgiref
+aura_sr
+authlib
+backports.zoneinfo
+boto3
+click
+colorama
+contextvars
+cron_descriptor
+dateparser
+deprecated
+django
+django_apscheduler
+exa_py
+factory
+faker
+faster_whisper
+ffmpeg
+fire
+flupy
+freezegun
+google
+huggingface_hub
+jinja2
+khoj
+konlpy
+langchain
+langchain_community
+libtmux
+litellm
+llama_index
+lxml
+magika
+moviepy
+neo4j
+nest_asyncio
+nltk
+numpy
+openpyxl
+osmium
+packaging
+pgvector
+posthog
+psutil
+psycopg2
+pypdf
+pytest
+r2r
+redis
+resend
+rich
+schedule
+semaphore
+sentence_transformers
+soundfile
+spacy
+sqlalchemy
+stripe
+tailscale
+tenacity
+tiktoken
+torchaudio
+transformers
+twilio
+typing_extensions
+uvicorn
+vecs
+vectordb
+websockets
+whisper
+whisperplus
+youtube_dl

================================================================

@@ -27,6 +27,7 @@ SUBNET_BROADCAST = os.getenv("SUBNET_BROADCAST", '10.255.255.255')
 MAX_CPU_CORES = min(int(os.getenv("MAX_CPU_CORES", int(multiprocessing.cpu_count()/2))), multiprocessing.cpu_count())
 IMG = Configuration.load('img', 'secrets')
 News = Configuration.load('news', 'secrets')
+Archivist = Configuration.load('archivist')
 Scrape = Configuration.load('scrape', 'secrets', Dir)
 Serve = Configuration.load('serve')

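The new 'archivist' configuration is consumed later in this commit via Archivist.dir and Archivist.blacklist. A minimal stand-in showing the shape that usage implies; the field defaults here are assumptions, not values from the source:

    from dataclasses import dataclass, field
    from pathlib import Path
    from typing import List

    @dataclass
    class ArchivistConfig:
        # dir: archive root used by assemble_archive_path (takes over from ARCHIVE_DIR)
        dir: Path = Path.home() / "archive"  # assumed default
        # blacklist: custom words passed to contains_profanity
        blacklist: List[str] = field(default_factory=list)

    Archivist = ArchivistConfig()
    print(Archivist.dir, Archivist.blacklist)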
================================================================

@@ -11,7 +11,7 @@ import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 from datetime import datetime as dt_datetime, timedelta
-from typing import Optional
+from typing import Optional, List, Tuple
 import aiohttp
 import aiofiles
 import newspaper

@@ -19,12 +19,13 @@ import trafilatura
 from newspaper import Article
 from readability import Document
 from markdownify import markdownify as md
+from better_profanity import profanity
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 from fastapi import APIRouter, BackgroundTasks, UploadFile, Form, HTTPException, Query, Path as FastAPIPath
 from pathlib import Path
-from sijapi import L, News, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
-from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path
+from sijapi import L, News, Archivist, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
+from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity
 from sijapi.routers import gis, llm, tts, note

 news = APIRouter()

@@ -329,12 +330,13 @@ async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]
     return markdown_content

 async def process_archive(
     url: str,
     title: Optional[str] = None,
     encoding: str = 'utf-8',
     source: Optional[str] = None,
-) -> Path:
+) -> Optional[Path]:
     timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
     readable_title = title if title else f"{url} - {timestamp}"

@@ -342,16 +344,24 @@ async def process_archive(
     if content is None:
         raise HTTPException(status_code=400, detail="Failed to convert content to markdown")

-    markdown_path, relative_path = assemble_archive_path(readable_title, ".md")
+    if contains_profanity(url, content, 0.2, Archivist.blacklist):
+        info(f"Not archiving {url} due to profanity")
+        return None
+
+    try:
+        markdown_path, relative_path = assemble_archive_path(filename=readable_title, extension=".md")
+    except Exception as e:
+        warn(f"Failed to assemble archive path for {url}: {str(e)}")
+        return None

     markdown_content = f"---\n"
-    markdown_content += f"title: {readable_title}\n"
+    markdown_content += f"title: \"{readable_title}\"\n"
     markdown_content += f"added: {timestamp}\n"
-    markdown_content += f"url: {url}"
-    markdown_content += f"date: {dt_datetime.now().strftime('%Y-%m-%d')}"
+    markdown_content += f"url: \"{url}\"\n"
+    markdown_content += f"date: \"{dt_datetime.now().strftime('%Y-%m-%d')}\"\n"
     markdown_content += f"---\n\n"
     markdown_content += f"# {readable_title}\n\n"
-    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}"
+    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}\n\n"
     markdown_content += content

     try:

@@ -365,6 +375,7 @@ async def process_archive(
         return None

 def download_file(url, folder):
     os.makedirs(folder, exist_ok=True)
     filename = str(uuid.uuid4()) + os.path.splitext(urlparse(url).path)[-1]

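The front-matter edits above add explicit quoting and trailing newlines: without the "\n", the url, date, and closing "---" lines ran together, and without the quotes, a title or URL containing a colon produces invalid YAML. A minimal sketch of the quoting issue (requires pyyaml; the sample title is invented):

    import yaml  # pyyaml, assumed available

    title = "Breaking: markets fall"  # hypothetical title containing a colon

    print(yaml.safe_load(f'title: "{title}"'))  # {'title': 'Breaking: markets fall'}

    try:
        yaml.safe_load(f"title: {title}")  # unquoted "key: value: more" is invalid YAML
    except yaml.YAMLError as exc:
        print("unquoted form fails:", type(exc).__name__)

Titles containing a double quote would still need escaping, which this quoting scheme does not handle.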
================================================================

@@ -12,11 +12,12 @@ from dateutil import parser
 from pathlib import Path
 import filetype
 from PyPDF2 import PdfReader
+from better_profanity import profanity
 from pdfminer.high_level import extract_text as pdfminer_extract_text
 import pytesseract
 from pdf2image import convert_from_path
 from datetime import datetime, date, time
-from typing import Optional, Union, Tuple
+from typing import Optional, Union, Tuple, List
 import asyncio
 from PIL import Image
 import pandas as pd

@@ -28,7 +29,7 @@ from sshtunnel import SSHTunnelForwarder
 from fastapi import Depends, HTTPException, Request, UploadFile
 from fastapi.security.api_key import APIKeyHeader
-from sijapi import L, API, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_PATH_LENGTH, ARCHIVE_DIR
+from sijapi import L, API, Archivist, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_PATH_LENGTH, ARCHIVE_DIR

 logger = L.get_module_logger('utilities')
 def debug(text: str): logger.debug(text)
@@ -63,35 +64,58 @@ def validate_api_key(request: Request, api_key: str = Depends(api_key_header)):
     raise HTTPException(status_code=401, detail="Invalid or missing API key")

-def assemble_archive_path(filename: str, extension: str = ".md", date_time: datetime = datetime.now(), subdir: str = None) -> Tuple[Path, Path]:
+def assemble_archive_path(filename: str, extension: str = None, date_time: datetime = datetime.now(), subdir: str = None) -> Tuple[Path, Path]:
     year = date_time.strftime(YEAR_FMT)
     month = date_time.strftime(MONTH_FMT)
     day = date_time.strftime(DAY_FMT)
     day_short = date_time.strftime(DAY_SHORT_FMT)
     timestamp = date_time.strftime("%H%M%S")

-    # Ensure the extension is preserved
-    base_name, ext = os.path.splitext(filename)
-    extension = ext if ext else extension
+    # Handle extension priority
+    base_name, original_ext = os.path.splitext(filename)
+    if extension is not None:
+        # Use the provided extension parameter
+        final_extension = extension if extension.startswith('.') else f'.{extension}'
+    elif original_ext:
+        # Use the original file extension if present
+        final_extension = original_ext
+    else:
+        # Default to ".md" if no extension is provided or present
+        final_extension = ".md"

     # Initial sanitization
     sanitized_base = sanitize_filename(base_name, '')
-    filename = f"{day_short} {timestamp} {sanitized_base}{extension}"
+    filename = f"{day_short} {timestamp} {sanitized_base}{final_extension}"
     relative_path = Path(year) / month / day / filename
-    absolute_path = ARCHIVE_DIR / relative_path
+    absolute_path = Archivist.dir / relative_path

     # Ensure the total path length doesn't exceed MAX_PATH_LENGTH
-    while len(str(absolute_path)) > MAX_PATH_LENGTH:
+    while len(str(absolute_path)) > MAX_PATH_LENGTH and len(sanitized_base) > 0:
+        # Truncate the sanitized_base, not the full filename
         sanitized_base = sanitized_base[:-1]
-        filename = f"{day_short} {timestamp} {sanitized_base}{extension}"
+        filename = f"{day_short} {timestamp} {sanitized_base}{final_extension}"
         relative_path = Path(year) / month / day / filename
-        absolute_path = ARCHIVE_DIR / relative_path
+        absolute_path = Archivist.dir / relative_path
+
+    # If we've exhausted sanitized_base and the path is still too long
+    if len(str(absolute_path)) > MAX_PATH_LENGTH:
+        # Use a hash of the original filename to ensure uniqueness
+        hash_suffix = hashlib.md5(base_name.encode()).hexdigest()[:8]
+        filename = f"{day_short} {timestamp} {hash_suffix}{final_extension}"
+        relative_path = Path(year) / month / day / filename
+        absolute_path = Archivist.dir / relative_path
+
+    # Final check and truncation if necessary
+    if len(str(absolute_path)) > MAX_PATH_LENGTH:
+        overflow = len(str(absolute_path)) - MAX_PATH_LENGTH
+        absolute_path = Path(str(absolute_path)[:-overflow])
+        relative_path = Path(str(relative_path)[:-overflow])

     return absolute_path, relative_path

 def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str = None, extension: str = None, no_timestamp: bool = False) -> Tuple[Path, Path]:
     '''
     Obsidian helper. Takes a datetime and optional subdirectory name, filename, and extension.
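The reworked length handling above shrinks the human-readable stem first and only falls back to a hash when even an empty stem overflows. A self-contained sketch of the same strategy, with a stand-in directory and limit (MAX_PATH_LENGTH here is not the sijapi value):

    import hashlib
    from pathlib import Path

    MAX_PATH_LENGTH = 60                     # stand-in limit
    base_dir = Path("/tmp/archive/2024/08")  # stand-in for Archivist.dir/year/month/day

    def capped_name(stem: str, ext: str = ".md") -> Path:
        original = stem
        candidate = base_dir / f"{stem}{ext}"
        # Trim the readable stem one character at a time until the path fits.
        while len(str(candidate)) > MAX_PATH_LENGTH and stem:
            stem = stem[:-1]
            candidate = base_dir / f"{stem}{ext}"
        # If even an empty stem is too long, substitute a short, unique digest.
        if len(str(candidate)) > MAX_PATH_LENGTH:
            digest = hashlib.md5(original.encode()).hexdigest()[:8]
            candidate = base_dir / f"{digest}{ext}"
        return candidate

    print(capped_name("an extremely long human readable article title that overflows"))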
@@ -161,6 +185,21 @@ def f(file):
     return thefile

+def contains_profanity(url: str, content: str, threshold: float = 0.2, custom_words: Optional[List[str]] = None) -> bool:
+    custom_words = custom_words or []
+    if any(word.lower() in url.lower() for word in custom_words):
+        info(f"Blacklisted word in {url}")
+        return True
+
+    # Check content for profanity
+    profanity.load_censor_words(custom_words)
+    word_list = content.split()
+    content_profanity_count = sum(1 for word in word_list if profanity.contains_profanity(word))
+    content_profanity_ratio = content_profanity_count / len(word_list) if word_list else 0
+    debug(f"Profanity ratio for {url}: {content_profanity_ratio}")
+    return content_profanity_ratio >= threshold

 def get_extension(file):
     try:
         if isinstance(file, str):
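To get a feel for how the new helper behaves with the 0.2 threshold that process_archive passes in: better_profanity is the library imported above, and the sample input is invented.

    from better_profanity import profanity  # same library the commit imports

    # Rough standalone restatement of the ratio check: flag content when at
    # least 20% of whitespace-separated words are profane.
    def ratio(content: str) -> float:
        profanity.load_censor_words()
        words = content.split()
        hits = sum(1 for w in words if profanity.contains_profanity(w))
        return hits / len(words) if words else 0.0

    print(ratio("a perfectly clean sentence"))  # 0.0, so the page would be archived
    # Ten words with two profane ones would score 0.2 and be skipped.

Splitting on whitespace means punctuation stays attached to words, so better_profanity's own tokenization inside contains_profanity does the real matching; the ratio is a rough per-word estimate rather than an exact count.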