Auto-update: Mon Aug 5 17:35:13 PDT 2024

This commit is contained in:
sanj 2024-08-05 17:35:13 -07:00
parent ec21f92242
commit 72d3ba27b2
8 changed files with 127741 additions and 94 deletions

@@ -40,6 +40,7 @@ os.makedirs(ALERTS_DIR, exist_ok=True)
REQUESTS_DIR = LOGS_DIR / "requests"
os.makedirs(REQUESTS_DIR, exist_ok=True)
REQUESTS_LOG_PATH = LOGS_DIR / "requests.log"
BLOCKLISTS_DIR = DATA_DIR / "blocklists"
# LOCATE AND WEATHER LOCALIZATIONS
USER_FULLNAME = os.getenv('USER_FULLNAME')

@@ -16,10 +16,11 @@ PUBLIC:
TRUSTED_SUBNETS:
- 127.0.0.1/32
- 10.0.0.0/24
- 192.168.0.0/24
- 10.13.37.0/24
- 100.64.64.0/24
MODULES:
archivist: on
asr: on
cal: on
cf: off
@@ -84,13 +85,15 @@ POOL:
conda_env: 'myenv'
EXTENSIONS:
pgp: on
archivist: on
courtlistener: off
macnotify: on
shellfish: on
TZ: 'UTC'
TZ: "America/Los_Angeles"
KEYS: ['{{ SECRET.GLOBAL_API_KEYS }}']
KEYS: ["{{ SECRET.GLOBAL_API_KEYS }}"]
GARBAGE:
COLLECTION_INTERVAL: 60 * 60
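
Note that YAML does not evaluate arithmetic, so "60 * 60" arrives as the literal string rather than 3600; whatever loads this config has to compute it. A minimal sketch of one safe way to do that, assuming a hypothetical parse_interval helper that is not part of sijapi:

import ast
import operator

# Hypothetical helper: evaluate simple products/sums like "60 * 60"
# without resorting to eval().
_OPS = {ast.Mult: operator.mul, ast.Add: operator.add}

def parse_interval(value) -> int:
    if isinstance(value, (int, float)):
        return int(value)
    node = ast.parse(str(value), mode="eval").body
    if (isinstance(node, ast.BinOp) and type(node.op) in _OPS
            and isinstance(node.left, ast.Constant) and isinstance(node.right, ast.Constant)):
        return int(_OPS[type(node.op)](node.left.value, node.right.value))
    return int(str(value))

# parse_interval("60 * 60") -> 3600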

@@ -0,0 +1,6 @@
dir: "~/.private/archive/"
blacklist:
- "http://10.64.64.10"
- "http://10.64.64.11"
- "blacklisted_word"
- "another_blacklisted_word"

File diff suppressed because it is too large.

File diff suppressed because it is too large.

sijapi/routers/archivist.py (new file, 113 lines)

@@ -0,0 +1,113 @@
'''
Used to archive sites visited in the browser via the archivist.js UserScript.
'''
# routers/archivist.py
from fastapi import APIRouter, BackgroundTasks, UploadFile, Form, HTTPException, Query, Path as FastAPIPath
import os
import uuid
import asyncio
import shutil
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from datetime import datetime as dt_datetime, timedelta
from typing import Optional, List, Tuple
import aiohttp
import aiofiles
import newspaper
import trafilatura
from adblockparser import AdblockRules
import logging
from pathlib import Path
from newspaper import Article
from readability import Document
from markdownify import markdownify as md
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from better_profanity import profanity
from sijapi.classes import L, API, Archivist
from sijapi.utilities import html_to_markdown, sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity, is_ad_or_tracker, initialize_adblock_rules, contains_blacklisted_word
from sijapi import L, Archivist, BLOCKLISTS_DIR, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
archivist = APIRouter()
logger = L.get_module_logger("archivist")
def debug(text: str): logger.debug(text)
def info(text: str): logger.info(text)
def warn(text: str): logger.warning(text)
def err(text: str): logger.error(text)
def crit(text: str): logger.critical(text)
adblock_rules = initialize_adblock_rules(BLOCKLISTS_DIR)
@archivist.post("/archive")
async def archive_post(
url: Optional[str] = Form(None),
source: Optional[str] = Form(None),
title: Optional[str] = Form(None),
encoding: str = Form('utf-8')
):
if not url:
warn(f"No URL provided to /archive endpoint.")
raise HTTPException(status_code=400, detail="URL is required")
if is_ad_or_tracker(url, adblock_rules):
debug(f"Skipping likely ad or tracker URL: {url}")
raise HTTPException(status_code=400, detail="URL is likely an ad or tracker")
markdown_filename = await process_archive(url, title, encoding, source)
return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}
async def process_archive(
url: str,
title: Optional[str] = None,
encoding: str = 'utf-8',
source: Optional[str] = None,
) -> Optional[Path]:
# Check URL against blacklist
if contains_blacklisted_word(url, Archivist.blacklist):
info(f"Not archiving {url} due to blacklisted word in URL")
return None
timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
readable_title = title if title else f"{url} - {timestamp}"
content = await html_to_markdown(url, source)
if content is None:
raise HTTPException(status_code=400, detail="Failed to convert content to markdown")
# Check content for profanity
if contains_profanity(content, threshold=0.01, custom_words=Archivist.blacklist):
info(f"Not archiving {url} due to profanity in content")
return None
try:
markdown_path, relative_path = assemble_archive_path(filename=readable_title, extension=".md")
except Exception as e:
warn(f"Failed to assemble archive path for {url}: {str(e)}")
return None
markdown_content = f"---\n"
markdown_content += f"title: \"{readable_title}\"\n"
markdown_content += f"added: {timestamp}\n"
markdown_content += f"url: \"{url}\"\n"
markdown_content += f"date: \"{dt_datetime.now().strftime('%Y-%m-%d')}\"\n"
markdown_content += f"---\n\n"
markdown_content += f"# {readable_title}\n\n"
markdown_content += f"Clipped from [{url}]({url}) on {timestamp}\n\n"
markdown_content += content
try:
markdown_path.parent.mkdir(parents=True, exist_ok=True)
with open(markdown_path, 'w', encoding=encoding) as md_file:
md_file.write(markdown_content)
debug(f"Successfully saved to {markdown_path}")
return markdown_path
except Exception as e:
warn(f"Failed to write markdown file: {str(e)}")
return None
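
To exercise the new endpoint by hand (outside the archivist.js UserScript), a form-encoded POST is enough; the host, port, and mount path below are placeholders for wherever sijapi is running:

import requests

# Placeholder host/port; adjust to your sijapi deployment.
resp = requests.post(
    "http://localhost:4444/archive",
    data={
        "url": "https://example.com/some-article",
        "title": "Some Article",
        "encoding": "utf-8",
        # "source": "<html>...</html>",  # optionally send pre-rendered HTML
    },
    timeout=30,
)
print(resp.status_code, resp.json())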

@@ -25,7 +25,7 @@ from urllib3.util.retry import Retry
from fastapi import APIRouter, BackgroundTasks, UploadFile, Form, HTTPException, Query, Path as FastAPIPath
from pathlib import Path
from sijapi import L, News, Archivist, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity
from sijapi.utilities import html_to_markdown, sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity, is_ad_or_tracker
from sijapi.routers import gis, llm, tts, note
news = APIRouter()
@@ -179,6 +179,7 @@ async def download_and_save_article(article, site_name, earliest_date, bg_tasks:
err(f"Error processing article from {article.url}: {str(e)}")
return False
async def process_news_site(site, bg_tasks: BackgroundTasks):
info(f"Downloading articles from {site.name}...")
@@ -251,15 +252,6 @@ async def clip_get(
@news.post("/archive")
async def archive_post(
url: Optional[str] = Form(None),
source: Optional[str] = Form(None),
title: Optional[str] = Form(None),
encoding: str = Form('utf-8')
):
markdown_filename = await process_archive(url, title, encoding, source)
return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}
async def parse_article(url: str, source: Optional[str] = None) -> Article:
@@ -302,77 +294,6 @@ async def parse_article(url: str, source: Optional[str] = None) -> Article:
return article
async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
if source:
html_content = source
elif url:
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
html_content = await response.text()
else:
err(f"Unable to convert nothing to markdown.")
return None
# Use readability to extract the main content
doc = Document(html_content)
cleaned_html = doc.summary()
# Parse the cleaned HTML with BeautifulSoup for any additional processing
soup = BeautifulSoup(cleaned_html, 'html.parser')
# Remove any remaining unwanted elements
for element in soup(['script', 'style']):
element.decompose()
# Convert to markdown
markdown_content = md(str(soup), heading_style="ATX")
return markdown_content
async def process_archive(
url: str,
title: Optional[str] = None,
encoding: str = 'utf-8',
source: Optional[str] = None,
) -> Optional[Path]:
timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
readable_title = title if title else f"{url} - {timestamp}"
content = await html_to_markdown(url, source)
if content is None:
raise HTTPException(status_code=400, detail="Failed to convert content to markdown")
if contains_profanity(url, content, 0.2, Archivist.blacklist):
info(f"Not archiving {url} due to profanity")
return None
try:
markdown_path, relative_path = assemble_archive_path(filename=readable_title, extension=".md")
except Exception as e:
warn(f"Failed to assemble archive path for {url}: {str(e)}")
return None
markdown_content = f"---\n"
markdown_content += f"title: \"{readable_title}\"\n"
markdown_content += f"added: {timestamp}\n"
markdown_content += f"url: \"{url}\"\n"
markdown_content += f"date: \"{dt_datetime.now().strftime('%Y-%m-%d')}\"\n"
markdown_content += f"---\n\n"
markdown_content += f"# {readable_title}\n\n"
markdown_content += f"Clipped from [{url}]({url}) on {timestamp}\n\n"
markdown_content += content
try:
markdown_path.parent.mkdir(parents=True, exist_ok=True)
with open(markdown_path, 'w', encoding=encoding) as md_file:
md_file.write(markdown_content)
debug(f"Successfully saved to {markdown_path}")
return markdown_path
except Exception as e:
warn(f"Failed to write markdown file: {str(e)}")
return None

@@ -13,6 +13,7 @@ from pathlib import Path
import filetype
from PyPDF2 import PdfReader
from better_profanity import profanity
from adblockparser import AdblockRules
from pdfminer.high_level import extract_text as pdfminer_extract_text
import pytesseract
from pdf2image import convert_from_path
@@ -185,21 +186,47 @@ def f(file):
return thefile
def contains_profanity(url: str, content: str, threshold: float = 0.2, custom_words: Optional[List[str]] = None) -> bool:
custom_words = custom_words or []
if any(word.lower() in url.lower() for word in custom_words):
info(f"Blacklisted word in {url}")
return True
def is_ad_or_tracker(url: str, rules: AdblockRules) -> bool:
parsed_url = urlparse(url)
return rules.should_block(url, { 'domain': parsed_url.netloc })
# Check content for profanity
def contains_blacklisted_word(text: str, blacklist: List[str]) -> bool:
return any(word.lower() in text.lower() for word in blacklist)
def contains_profanity(content: str, threshold: float = 0.01, custom_words: Optional[List[str]] = None) -> bool:
custom_words = custom_words or []
# Combine the profanity library's word list with custom words
profanity.load_censor_words(custom_words)
word_list = content.split()
content_profanity_count = sum(1 for word in word_list if profanity.contains_profanity(word))
content_profanity_ratio = content_profanity_count / len(word_list) if word_list else 0
debug(f"Profanity ratio for {url}: {content_profanity_ratio}")
debug(f"Profanity ratio for content: {content_profanity_ratio}")
return content_profanity_ratio >= threshold
def load_filter_lists(blocklists_dir: Path):
rules = []
for file_path in blocklists_dir.glob('*.txt'):
try:
with open(file_path, 'r', encoding='utf-8') as file:
rules.extend(file.read().splitlines())
logging.info(f"Loaded blocklist: {file_path.name}")
except Exception as e:
logging.error(f"Error loading blocklist {file_path.name}: {str(e)}")
return rules
def initialize_adblock_rules(blocklists_dir: Path):
rules = load_filter_lists(blocklists_dir)
logging.info(f"Initialized AdblockRules with {len(rules)} rules")
return AdblockRules(rules)
def get_extension(file):
try:
if isinstance(file, str):
@@ -519,3 +546,31 @@ async def run_ssh_command(server, command):
except Exception as e:
err(f"SSH command failed for server {server.id}: {str(e)}")
raise
async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
if source:
html_content = source
elif url:
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
html_content = await response.text()
else:
err(f"Unable to convert nothing to markdown.")
return None
# Use readability to extract the main content
doc = Document(html_content)
cleaned_html = doc.summary()
# Parse the cleaned HTML with BeautifulSoup for any additional processing
soup = BeautifulSoup(cleaned_html, 'html.parser')
# Remove any remaining unwanted elements
for element in soup(['script', 'style']):
element.decompose()
# Convert to markdown
markdown_content = md(str(soup), heading_style="ATX")
return markdown_content
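
Taken together, a sketch of how these new helpers compose outside of FastAPI; the blocklists directory, threshold, and URL below are placeholders (sijapi itself points BLOCKLISTS_DIR at DATA_DIR / "blocklists"):

import asyncio
from pathlib import Path

from sijapi.utilities import (
    initialize_adblock_rules, is_ad_or_tracker,
    contains_blacklisted_word, contains_profanity, html_to_markdown,
)

# Placeholder blocklists directory with one or more Adblock-format .txt lists.
rules = initialize_adblock_rules(Path("~/.private/blocklists").expanduser())

url = "https://example.com/article"
if is_ad_or_tracker(url, rules) or contains_blacklisted_word(url, ["blacklisted_word"]):
    print(f"Skipping {url}")
else:
    markdown = asyncio.run(html_to_markdown(url=url))
    if markdown and not contains_profanity(markdown, threshold=0.01):
        print(markdown[:500])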