Auto-update: Mon Aug 5 14:32:28 PDT 2024

This commit is contained in:
sanj 2024-08-05 14:32:28 -07:00
parent 27429df815
commit ec21f92242
5 changed files with 166 additions and 27 deletions

11
missing-packages.txt Normal file
View file

@ -0,0 +1,11 @@
__future__
cv2
dateparser_data
ionic
json # Used for working with JSON data
llama_cpp
markdown_it
phonenumber_field
pptx
requests # Used for making HTTP requests
sijapi

View file

@ -49,6 +49,83 @@ readability
trafilatura
urllib3
anyio
location
SRTM.py
better_profanity
EventKit
Foundation
aiosqlite
anthropic
apscheduler
asgiref
aura_sr
authlib
backports.zoneinfo
boto3
click
colorama
contextvars
cron_descriptor
dateparser
deprecated
django
django_apscheduler
exa_py
factory
faker
faster_whisper
ffmpeg
fire
flupy
freezegun
google
huggingface_hub
jinja2
khoj
konlpy
langchain
langchain_community
libtmux
litellm
llama_index
lxml
magika
moviepy
neo4j
nest_asyncio
nltk
numpy
openpyxl
osmium
packaging
pgvector
posthog
psutil
psycopg2
pypdf
pytest
r2r
redis
resend
rich
schedule
semaphore
sentence_transformers
soundfile
spacy
sqlalchemy
stripe
tailscale
tenacity
tiktoken
torchaudio
transformers
twilio
typing_extensions
uvicorn
vecs
vectordb
websockets
whisper
whisperplus
youtube_dl

View file

@ -27,6 +27,7 @@ SUBNET_BROADCAST = os.getenv("SUBNET_BROADCAST", '10.255.255.255')
MAX_CPU_CORES = min(int(os.getenv("MAX_CPU_CORES", int(multiprocessing.cpu_count()/2))), multiprocessing.cpu_count())
IMG = Configuration.load('img', 'secrets')
News = Configuration.load('news', 'secrets')
Archivist = Configuration.load('archivist')
Scrape = Configuration.load('scrape', 'secrets', Dir)
Serve = Configuration.load('serve')

View file

@ -11,7 +11,7 @@ import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from datetime import datetime as dt_datetime, timedelta
from typing import Optional
from typing import Optional, List, Tuple
import aiohttp
import aiofiles
import newspaper
@ -19,12 +19,13 @@ import trafilatura
from newspaper import Article
from readability import Document
from markdownify import markdownify as md
from better_profanity import profanity
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from fastapi import APIRouter, BackgroundTasks, UploadFile, Form, HTTPException, Query, Path as FastAPIPath
from pathlib import Path
from sijapi import L, News, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path
from sijapi import L, News, Archivist, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity
from sijapi.routers import gis, llm, tts, note
news = APIRouter()
@ -329,12 +330,13 @@ async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]
return markdown_content
async def process_archive(
url: str,
title: Optional[str] = None,
encoding: str = 'utf-8',
source: Optional[str] = None,
) -> Path:
) -> Optional[Path]:
timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
readable_title = title if title else f"{url} - {timestamp}"
@ -342,16 +344,24 @@ async def process_archive(
if content is None:
raise HTTPException(status_code=400, detail="Failed to convert content to markdown")
markdown_path, relative_path = assemble_archive_path(readable_title, ".md")
if contains_profanity(url, content, 0.2, Archivist.blacklist):
info(f"Not archiving {url} due to profanity")
return None
try:
markdown_path, relative_path = assemble_archive_path(filename=readable_title, extension=".md")
except Exception as e:
warn(f"Failed to assemble archive path for {url}: {str(e)}")
return None
markdown_content = f"---\n"
markdown_content += f"title: {readable_title}\n"
markdown_content += f"title: \"{readable_title}\"\n"
markdown_content += f"added: {timestamp}\n"
markdown_content += f"url: {url}"
markdown_content += f"date: {dt_datetime.now().strftime('%Y-%m-%d')}"
markdown_content += f"url: \"{url}\"\n"
markdown_content += f"date: \"{dt_datetime.now().strftime('%Y-%m-%d')}\"\n"
markdown_content += f"---\n\n"
markdown_content += f"# {readable_title}\n\n"
markdown_content += f"Clipped from [{url}]({url}) on {timestamp}"
markdown_content += f"Clipped from [{url}]({url}) on {timestamp}\n\n"
markdown_content += content
try:
@ -365,6 +375,7 @@ async def process_archive(
return None
def download_file(url, folder):
os.makedirs(folder, exist_ok=True)
filename = str(uuid.uuid4()) + os.path.splitext(urlparse(url).path)[-1]

View file

@ -12,11 +12,12 @@ from dateutil import parser
from pathlib import Path
import filetype
from PyPDF2 import PdfReader
from better_profanity import profanity
from pdfminer.high_level import extract_text as pdfminer_extract_text
import pytesseract
from pdf2image import convert_from_path
from datetime import datetime, date, time
from typing import Optional, Union, Tuple
from typing import Optional, Union, Tuple, List
import asyncio
from PIL import Image
import pandas as pd
@ -28,7 +29,7 @@ from sshtunnel import SSHTunnelForwarder
from fastapi import Depends, HTTPException, Request, UploadFile
from fastapi.security.api_key import APIKeyHeader
from sijapi import L, API, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_PATH_LENGTH, ARCHIVE_DIR
from sijapi import L, API, Archivist, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_PATH_LENGTH, ARCHIVE_DIR
logger = L.get_module_logger('utilities')
def debug(text: str): logger.debug(text)
@ -63,35 +64,58 @@ def validate_api_key(request: Request, api_key: str = Depends(api_key_header)):
raise HTTPException(status_code=401, detail="Invalid or missing API key")
def assemble_archive_path(filename: str, extension: str = ".md", date_time: datetime = datetime.now(), subdir: str = None) -> Tuple[Path, Path]:
def assemble_archive_path(filename: str, extension: str = None, date_time: datetime = datetime.now(), subdir: str = None) -> Tuple[Path, Path]:
year = date_time.strftime(YEAR_FMT)
month = date_time.strftime(MONTH_FMT)
day = date_time.strftime(DAY_FMT)
day_short = date_time.strftime(DAY_SHORT_FMT)
timestamp = date_time.strftime("%H%M%S")
# Ensure the extension is preserved
base_name, ext = os.path.splitext(filename)
extension = ext if ext else extension
# Handle extension priority
base_name, original_ext = os.path.splitext(filename)
if extension is not None:
# Use the provided extension parameter
final_extension = extension if extension.startswith('.') else f'.{extension}'
elif original_ext:
# Use the original file extension if present
final_extension = original_ext
else:
# Default to ".md" if no extension is provided or present
final_extension = ".md"
# Initial sanitization
sanitized_base = sanitize_filename(base_name, '')
filename = f"{day_short} {timestamp} {sanitized_base}{extension}"
filename = f"{day_short} {timestamp} {sanitized_base}{final_extension}"
relative_path = Path(year) / month / day / filename
absolute_path = ARCHIVE_DIR / relative_path
absolute_path = Archivist.dir / relative_path
# Ensure the total path length doesn't exceed MAX_PATH_LENGTH
while len(str(absolute_path)) > MAX_PATH_LENGTH:
# Truncate the sanitized_base, not the full filename
while len(str(absolute_path)) > MAX_PATH_LENGTH and len(sanitized_base) > 0:
sanitized_base = sanitized_base[:-1]
filename = f"{day_short} {timestamp} {sanitized_base}{extension}"
filename = f"{day_short} {timestamp} {sanitized_base}{final_extension}"
relative_path = Path(year) / month / day / filename
absolute_path = ARCHIVE_DIR / relative_path
# If we've exhausted sanitized_base and the path is still too long
if len(str(absolute_path)) > MAX_PATH_LENGTH:
# Use a hash of the original filename to ensure uniqueness
hash_suffix = hashlib.md5(base_name.encode()).hexdigest()[:8]
filename = f"{day_short} {timestamp} {hash_suffix}{final_extension}"
relative_path = Path(year) / month / day / filename
absolute_path = ARCHIVE_DIR / relative_path
# Final check and truncation if necessary
if len(str(absolute_path)) > MAX_PATH_LENGTH:
overflow = len(str(absolute_path)) - MAX_PATH_LENGTH
absolute_path = Path(str(absolute_path)[:-overflow])
relative_path = Path(str(relative_path)[:-overflow])
return absolute_path, relative_path
def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str = None, extension: str = None, no_timestamp: bool = False) -> Tuple[Path, Path]:
'''
Obsidian helper. Takes a datetime and optional subdirectory name, filename, and extension.
@ -161,6 +185,21 @@ def f(file):
return thefile
def contains_profanity(url: str, content: str, threshold: float = 0.2, custom_words: Optional[List[str]] = None) -> bool:
    """Return True when the URL contains a blacklisted word or the share of
    profane words in the content meets the threshold.

    Args:
        url: Page URL; checked case-insensitively against custom_words before
            the content itself is scanned.
        content: Page text, scanned word-by-word after whitespace splitting.
        threshold: Minimum profane-word ratio (0-1) that flags the content.
        custom_words: Optional extra blacklist words (e.g. Archivist.blacklist)
            added on top of the default profanity wordlist.

    Returns:
        True if the URL is blacklisted or the profanity ratio >= threshold.
    """
    custom_words = custom_words or []

    # A blacklisted word anywhere in the URL short-circuits the content scan.
    if any(word.lower() in url.lower() for word in custom_words):
        info(f"Blacklisted word in {url}")
        return True

    # Check content for profanity. Load the default censor list, then EXTEND
    # it with the custom blacklist: passing custom_words directly to
    # load_censor_words() would replace the default wordlist entirely, so a
    # non-empty blacklist would disable ordinary profanity detection.
    profanity.load_censor_words()
    if custom_words:
        profanity.add_censor_words(custom_words)

    word_list = content.split()
    content_profanity_count = sum(1 for word in word_list if profanity.contains_profanity(word))
    # Guard against empty content to avoid a ZeroDivisionError.
    content_profanity_ratio = content_profanity_count / len(word_list) if word_list else 0
    debug(f"Profanity ratio for {url}: {content_profanity_ratio}")
    return content_profanity_ratio >= threshold
def get_extension(file):
try:
if isinstance(file, str):