Auto-update: Mon Aug 5 14:32:28 PDT 2024

parent 27429df815
commit ec21f92242
5 changed files with 166 additions and 27 deletions

missing-packages.txt (new file, 11 lines)
@@ -0,0 +1,11 @@
+__future__
+cv2
+dateparser_data
+ionic
+json # Used for working with JSON data
+llama_cpp
+markdown_it
+phonenumber_field
+pptx
+requests # Used for making HTTP requests
+sijapi

@@ -49,6 +49,83 @@ readability
trafilatura
urllib3
anyio
-semaphore
location
SRTM.py
+better_profanity
+EventKit
+Foundation
+aiosqlite
+anthropic
+apscheduler
+asgiref
+aura_sr
+authlib
+backports.zoneinfo
+boto3
+click
+colorama
+contextvars
+cron_descriptor
+dateparser
+deprecated
+django
+django_apscheduler
+exa_py
+factory
+faker
+faster_whisper
+ffmpeg
+fire
+flupy
+freezegun
+google
+huggingface_hub
+jinja2
+khoj
+konlpy
+langchain
+langchain_community
+libtmux
+litellm
+llama_index
+lxml
+magika
+moviepy
+neo4j
+nest_asyncio
+nltk
+numpy
+openpyxl
+osmium
+packaging
+pgvector
+posthog
+psutil
+psycopg2
+pypdf
+pytest
+r2r
+redis
+resend
+rich
+schedule
+semaphore
+sentence_transformers
+soundfile
+spacy
+sqlalchemy
+stripe
+tailscale
+tenacity
+tiktoken
+torchaudio
+transformers
+twilio
+typing_extensions
+uvicorn
+vecs
+vectordb
+websockets
+whisper
+whisperplus
+youtube_dl

@@ -27,6 +27,7 @@ SUBNET_BROADCAST = os.getenv("SUBNET_BROADCAST", '10.255.255.255')
MAX_CPU_CORES = min(int(os.getenv("MAX_CPU_CORES", int(multiprocessing.cpu_count()/2))), multiprocessing.cpu_count())
IMG = Configuration.load('img', 'secrets')
News = Configuration.load('news', 'secrets')
+Archivist = Configuration.load('archivist')
Scrape = Configuration.load('scrape', 'secrets', Dir)
Serve = Configuration.load('serve')

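
The rest of this commit reads two attributes off the new Archivist configuration: Archivist.blacklist, passed to the new contains_profanity check, and Archivist.dir, used as the archive root in assemble_archive_path. A minimal sketch of that relationship, with made-up values; it assumes Configuration.load exposes the config's keys as attributes, which this diff does not show:

    # Hypothetical stand-in for the object Configuration.load('archivist') is assumed to return.
    # Only the two attributes this commit actually reads are shown; the values are illustrative.
    from pathlib import Path
    from types import SimpleNamespace

    Archivist = SimpleNamespace(
        blacklist=["spam", "casino"],          # custom words that block archiving (see contains_profanity)
        dir=Path("~/archive").expanduser(),    # archive root (see assemble_archive_path)
    )

    # Elsewhere in this commit:
    #   if contains_profanity(url, content, 0.2, Archivist.blacklist): ...
    #   absolute_path = Archivist.dir / relative_path
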

@@ -11,7 +11,7 @@ import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from datetime import datetime as dt_datetime, timedelta
-from typing import Optional
+from typing import Optional, List, Tuple
import aiohttp
import aiofiles
import newspaper
@@ -19,12 +19,13 @@ import trafilatura
from newspaper import Article
from readability import Document
from markdownify import markdownify as md
+from better_profanity import profanity
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from fastapi import APIRouter, BackgroundTasks, UploadFile, Form, HTTPException, Query, Path as FastAPIPath
from pathlib import Path
-from sijapi import L, News, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
-from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path
+from sijapi import L, News, Archivist, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
+from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity
from sijapi.routers import gis, llm, tts, note

news = APIRouter()
@@ -329,12 +330,13 @@ async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]
    return markdown_content


+
async def process_archive(
    url: str,
    title: Optional[str] = None,
    encoding: str = 'utf-8',
    source: Optional[str] = None,
-) -> Path:
+) -> Optional[Path]:
    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
    readable_title = title if title else f"{url} - {timestamp}"

@@ -342,16 +344,24 @@ async def process_archive(
    if content is None:
        raise HTTPException(status_code=400, detail="Failed to convert content to markdown")

-    markdown_path, relative_path = assemble_archive_path(readable_title, ".md")
+    if contains_profanity(url, content, 0.2, Archivist.blacklist):
+        info(f"Not archiving {url} due to profanity")
+        return None
+
+    try:
+        markdown_path, relative_path = assemble_archive_path(filename=readable_title, extension=".md")
+    except Exception as e:
+        warn(f"Failed to assemble archive path for {url}: {str(e)}")
+        return None

    markdown_content = f"---\n"
-    markdown_content += f"title: {readable_title}\n"
+    markdown_content += f"title: \"{readable_title}\"\n"
    markdown_content += f"added: {timestamp}\n"
-    markdown_content += f"url: {url}"
-    markdown_content += f"date: {dt_datetime.now().strftime('%Y-%m-%d')}"
+    markdown_content += f"url: \"{url}\"\n"
+    markdown_content += f"date: \"{dt_datetime.now().strftime('%Y-%m-%d')}\"\n"
    markdown_content += f"---\n\n"
    markdown_content += f"# {readable_title}\n\n"
-    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}"
+    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}\n\n"
    markdown_content += content

    try:
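
For illustration only, with a hypothetical URL and no explicit title: the quoting and newline fixes above make the assembled note header valid YAML frontmatter, whereas the previous version left title and url unquoted and ran url, date, and the closing --- together on one line. The output would look roughly like:

    ---
    title: "https://example.com/article - Aug 05, 2024 at 14:32"
    added: Aug 05, 2024 at 14:32
    url: "https://example.com/article"
    date: "2024-08-05"
    ---

    # https://example.com/article - Aug 05, 2024 at 14:32

    Clipped from [https://example.com/article](https://example.com/article) on Aug 05, 2024 at 14:32
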
@@ -365,6 +375,7 @@ async def process_archive(
        return None


+
def download_file(url, folder):
    os.makedirs(folder, exist_ok=True)
    filename = str(uuid.uuid4()) + os.path.splitext(urlparse(url).path)[-1]

@@ -12,11 +12,12 @@ from dateutil import parser
from pathlib import Path
import filetype
from PyPDF2 import PdfReader
+from better_profanity import profanity
from pdfminer.high_level import extract_text as pdfminer_extract_text
import pytesseract
from pdf2image import convert_from_path
from datetime import datetime, date, time
-from typing import Optional, Union, Tuple
+from typing import Optional, Union, Tuple, List
import asyncio
from PIL import Image
import pandas as pd
@@ -28,7 +29,7 @@ from sshtunnel import SSHTunnelForwarder
from fastapi import Depends, HTTPException, Request, UploadFile
from fastapi.security.api_key import APIKeyHeader

-from sijapi import L, API, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_PATH_LENGTH, ARCHIVE_DIR
+from sijapi import L, API, Archivist, YEAR_FMT, MONTH_FMT, DAY_FMT, DAY_SHORT_FMT, OBSIDIAN_VAULT_DIR, ALLOWED_FILENAME_CHARS, MAX_PATH_LENGTH, ARCHIVE_DIR

logger = L.get_module_logger('utilities')
def debug(text: str): logger.debug(text)
@@ -63,35 +64,58 @@ def validate_api_key(request: Request, api_key: str = Depends(api_key_header)):
        raise HTTPException(status_code=401, detail="Invalid or missing API key")


-def assemble_archive_path(filename: str, extension: str = ".md", date_time: datetime = datetime.now(), subdir: str = None) -> Tuple[Path, Path]:
+def assemble_archive_path(filename: str, extension: str = None, date_time: datetime = datetime.now(), subdir: str = None) -> Tuple[Path, Path]:
    year = date_time.strftime(YEAR_FMT)
    month = date_time.strftime(MONTH_FMT)
    day = date_time.strftime(DAY_FMT)
    day_short = date_time.strftime(DAY_SHORT_FMT)
    timestamp = date_time.strftime("%H%M%S")

-    # Ensure the extension is preserved
-    base_name, ext = os.path.splitext(filename)
-    extension = ext if ext else extension
+    # Handle extension priority
+    base_name, original_ext = os.path.splitext(filename)
+    if extension is not None:
+        # Use the provided extension parameter
+        final_extension = extension if extension.startswith('.') else f'.{extension}'
+    elif original_ext:
+        # Use the original file extension if present
+        final_extension = original_ext
+    else:
+        # Default to ".md" if no extension is provided or present
+        final_extension = ".md"
+
    # Initial sanitization
    sanitized_base = sanitize_filename(base_name, '')
-    filename = f"{day_short} {timestamp} {sanitized_base}{extension}"
+    filename = f"{day_short} {timestamp} {sanitized_base}{final_extension}"

    relative_path = Path(year) / month / day / filename
-    absolute_path = ARCHIVE_DIR / relative_path
+    absolute_path = Archivist.dir / relative_path

    # Ensure the total path length doesn't exceed MAX_PATH_LENGTH
-    while len(str(absolute_path)) > MAX_PATH_LENGTH:
-        # Truncate the sanitized_base, not the full filename
+    while len(str(absolute_path)) > MAX_PATH_LENGTH and len(sanitized_base) > 0:
        sanitized_base = sanitized_base[:-1]
-        filename = f"{day_short} {timestamp} {sanitized_base}{extension}"
+        filename = f"{day_short} {timestamp} {sanitized_base}{final_extension}"
        relative_path = Path(year) / month / day / filename
        absolute_path = ARCHIVE_DIR / relative_path

+    # If we've exhausted sanitized_base and the path is still too long
+    if len(str(absolute_path)) > MAX_PATH_LENGTH:
+        # Use a hash of the original filename to ensure uniqueness
+        hash_suffix = hashlib.md5(base_name.encode()).hexdigest()[:8]
+        filename = f"{day_short} {timestamp} {hash_suffix}{final_extension}"
+        relative_path = Path(year) / month / day / filename
+        absolute_path = ARCHIVE_DIR / relative_path
+
+    # Final check and truncation if necessary
+    if len(str(absolute_path)) > MAX_PATH_LENGTH:
+        overflow = len(str(absolute_path)) - MAX_PATH_LENGTH
+        absolute_path = Path(str(absolute_path)[:-overflow])
+        relative_path = Path(str(relative_path)[:-overflow])
+
    return absolute_path, relative_path



def assemble_journal_path(date_time: datetime, subdir: str = None, filename: str = None, extension: str = None, no_timestamp: bool = False) -> Tuple[Path, Path]:
    '''
    Obsidian helper. Takes a datetime and optional subdirectory name, filename, and extension.
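
A quick sketch of the new extension precedence (the explicit extension argument wins, then any extension already on the filename, then the .md default). The calls below are hypothetical, and the comments show only the resulting extension, not the date/timestamp prefix and archive directory the function actually prepends:

    # Hypothetical calls illustrating the precedence introduced above.
    from sijapi.utilities import assemble_archive_path

    assemble_archive_path("report.txt", extension="pdf")   # explicit argument wins -> "... report.pdf" (a leading dot is added if missing)
    assemble_archive_path("report.txt")                     # falls back to the filename's own extension -> "... report.txt"
    assemble_archive_path("Some Article Title")             # neither given -> default -> "... Some Article Title.md"
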
@@ -161,6 +185,21 @@ def f(file):
    return thefile


+def contains_profanity(url: str, content: str, threshold: float = 0.2, custom_words: Optional[List[str]] = None) -> bool:
+    custom_words = custom_words or []
+    if any(word.lower() in url.lower() for word in custom_words):
+        info(f"Blacklisted word in {url}")
+        return True
+
+    # Check content for profanity
+    profanity.load_censor_words(custom_words)
+    word_list = content.split()
+    content_profanity_count = sum(1 for word in word_list if profanity.contains_profanity(word))
+    content_profanity_ratio = content_profanity_count / len(word_list) if word_list else 0
+    debug(f"Profanity ratio for {url}: {content_profanity_ratio}")
+    return content_profanity_ratio >= threshold
+
+
def get_extension(file):
    try:
        if isinstance(file, str):
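
A brief, hypothetical illustration of the new helper: any custom blacklist word appearing in the URL short-circuits to True; otherwise the decision is the share of whitespace-separated words that better_profanity flags, compared against the threshold. The values below are made up:

    from sijapi.utilities import contains_profanity

    # URL check: a blacklisted word anywhere in the URL blocks archiving outright.
    contains_profanity("https://example.com/casino-tips", "clean text", custom_words=["casino"])   # True

    # Content check: ratio of flagged words versus the threshold (0.2 by default).
    contains_profanity("https://example.com/ok", "a perfectly ordinary sentence", threshold=0.2)    # False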