Auto-update: Mon Aug 5 17:35:13 PDT 2024
parent ec21f92242
commit 72d3ba27b2
8 changed files with 127741 additions and 94 deletions
@@ -40,6 +40,7 @@ os.makedirs(ALERTS_DIR, exist_ok=True)
 REQUESTS_DIR = LOGS_DIR / "requests"
 os.makedirs(REQUESTS_DIR, exist_ok=True)
 REQUESTS_LOG_PATH = LOGS_DIR / "requests.log"
+BLOCKLISTS_DIR = DATA_DIR / "blocklists"

 # LOCATE AND WEATHER LOCALIZATIONS
 USER_FULLNAME = os.getenv('USER_FULLNAME')
@@ -16,10 +16,11 @@ PUBLIC:

 TRUSTED_SUBNETS:
   - 127.0.0.1/32
-  - 10.0.0.0/24
-  - 192.168.0.0/24
+  - 10.13.37.0/24
+  - 100.64.64.0/24

 MODULES:
+  archivist: on
   asr: on
   cal: on
   cf: off
@@ -84,13 +85,15 @@ POOL:
   conda_env: 'myenv'

 EXTENSIONS:
+  pgp: on
+  archivist: on
   courtlistener: off
   macnotify: on
   shellfish: on

-TZ: 'UTC'
+TZ: "America/Los_Angeles"

-KEYS: ['{{ SECRET.GLOBAL_API_KEYS }}']
+KEYS: ["{{ SECRET.GLOBAL_API_KEYS }}"]

 GARBAGE:
   COLLECTION_INTERVAL: 60 * 60
sijapi/config/archivist.yaml-example (new file, 6 lines)
@@ -0,0 +1,6 @@
+dir: "~/.private/archive/"
+blacklist:
+  - "http://10.64.64.10"
+  - "http://10.64.64.11"
+  - "blacklisted_word"
+  - "another_blacklisted_word"
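For orientation, a minimal sketch of reading this example config with PyYAML; the dir and blacklist keys come from the file above, but the loader itself (and the yaml dependency) is an assumption rather than how sijapi's Archivist class actually consumes it:

# Hypothetical loader for archivist.yaml-example; sijapi's Archivist class
# may parse this file differently.
from pathlib import Path
import yaml  # assumes PyYAML is installed

with open("sijapi/config/archivist.yaml-example", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

archive_dir = Path(cfg["dir"]).expanduser()  # -> ~/.private/archive/
blacklist = cfg.get("blacklist", [])         # URLs and words to skip
print(archive_dir, blacklist)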
sijapi/data/blocklists/easylist.txt (new file, 74173 lines)
File diff suppressed because it is too large.

sijapi/data/blocklists/easyprivacy.txt (new file, 53375 lines)
File diff suppressed because it is too large.
sijapi/routers/archivist.py (new file, 113 lines)
@@ -0,0 +1,113 @@
+'''
+Used to archive sites visited with browser via the archivist.js UserScript.
+'''
+# routers/archivist.py
+
+from fastapi import APIRouter, BackgroundTasks, UploadFile, Form, HTTPException, Query, Path as FastAPIPath
+import os
+import uuid
+import asyncio
+import shutil
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+from datetime import datetime as dt_datetime, timedelta
+from typing import Optional, List, Tuple
+import aiohttp
+import aiofiles
+import newspaper
+import trafilatura
+from adblockparser import AdblockRules
+from urllib.parse import urlparse
+import logging
+from typing import Optional
+from pathlib import Path
+from newspaper import Article
+from readability import Document
+from markdownify import markdownify as md
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+from datetime import datetime as dt_datetime
+from better_profanity import profanity
+from sijapi.classes import L, API, Archivist
+from sijapi.utilities import html_to_markdown, sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity, is_ad_or_tracker, initialize_adblock_rules, contains_blacklisted_word
+from sijapi import L, Archivist, BLOCKLISTS_DIR, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
+
+archivist = APIRouter()
+
+logger = L.get_module_logger("news")
+def debug(text: str): logger.debug(text)
+def info(text: str): logger.info(text)
+def warn(text: str): logger.warning(text)
+def err(text: str): logger.error(text)
+def crit(text: str): logger.critical(text)
+
+adblock_rules = initialize_adblock_rules(BLOCKLISTS_DIR)
+
+@archivist.post("/archive")
+async def archive_post(
+    url: Optional[str] = Form(None),
+    source: Optional[str] = Form(None),
+    title: Optional[str] = Form(None),
+    encoding: str = Form('utf-8')
+):
+    if not url:
+        warn(f"No URL provided to /archive endpoint.")
+        raise HTTPException(status_code=400, detail="URL is required")
+
+    if is_ad_or_tracker(url, adblock_rules):
+        debug(f"Skipping likely ad or tracker URL: {url}")
+        raise HTTPException(status_code=400, detail="URL is likely an ad or tracker")
+
+    markdown_filename = await process_archive(url, title, encoding, source)
+    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}
+
+
+async def process_archive(
+    url: str,
+    title: Optional[str] = None,
+    encoding: str = 'utf-8',
+    source: Optional[str] = None,
+) -> Optional[Path]:
+
+    # Check URL against blacklist
+    if contains_blacklisted_word(url, Archivist.blacklist):
+        info(f"Not archiving {url} due to blacklisted word in URL")
+        return None
+
+    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
+    readable_title = title if title else f"{url} - {timestamp}"
+
+    content = await html_to_markdown(url, source)
+    if content is None:
+        raise HTTPException(status_code=400, detail="Failed to convert content to markdown")
+
+    # Check content for profanity
+    if contains_profanity(content, threshold=0.01, custom_words=Archivist.blacklist):
+        info(f"Not archiving {url} due to profanity in content")
+        return None
+
+    try:
+        markdown_path, relative_path = assemble_archive_path(filename=readable_title, extension=".md")
+    except Exception as e:
+        warn(f"Failed to assemble archive path for {url}: {str(e)}")
+        return None
+
+    markdown_content = f"---\n"
+    markdown_content += f"title: \"{readable_title}\"\n"
+    markdown_content += f"added: {timestamp}\n"
+    markdown_content += f"url: \"{url}\"\n"
+    markdown_content += f"date: \"{dt_datetime.now().strftime('%Y-%m-%d')}\"\n"
+    markdown_content += f"---\n\n"
+    markdown_content += f"# {readable_title}\n\n"
+    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}\n\n"
+    markdown_content += content
+
+    try:
+        markdown_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(markdown_path, 'w', encoding=encoding) as md_file:
+            md_file.write(markdown_content)
+        debug(f"Successfully saved to {markdown_path}")
+        return markdown_path
+    except Exception as e:
+        warn(f"Failed to write markdown file: {str(e)}")
+        return None
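For context, a sketch of how a client such as the archivist.js UserScript might call the new endpoint; the host, port, and absence of auth headers here are assumptions, since this commit does not show how the router is mounted or secured:

# Hypothetical client call to POST /archive; host and port are placeholders.
import requests

resp = requests.post(
    "http://localhost:4444/archive",
    data={
        "url": "https://example.com/article",  # page to archive
        "title": "Example article",            # optional; defaults to URL + timestamp
        "encoding": "utf-8",
    },
)
print(resp.status_code, resp.json())  # e.g. {"message": "Clip saved successfully", ...}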
@@ -25,7 +25,7 @@ from urllib3.util.retry import Retry
 from fastapi import APIRouter, BackgroundTasks, UploadFile, Form, HTTPException, Query, Path as FastAPIPath
 from pathlib import Path
 from sijapi import L, News, Archivist, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
-from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity
+from sijapi.utilities import html_to_markdown, sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity, is_ad_or_tracker
 from sijapi.routers import gis, llm, tts, note

 news = APIRouter()
@@ -179,6 +179,7 @@ async def download_and_save_article(article, site_name, earliest_date, bg_tasks:
         err(f"Error processing article from {article.url}: {str(e)}")
         return False

+
 async def process_news_site(site, bg_tasks: BackgroundTasks):
     info(f"Downloading articles from {site.name}...")

@@ -251,15 +252,6 @@ async def clip_get(



-@news.post("/archive")
-async def archive_post(
-    url: Optional[str] = Form(None),
-    source: Optional[str] = Form(None),
-    title: Optional[str] = Form(None),
-    encoding: str = Form('utf-8')
-):
-    markdown_filename = await process_archive(url, title, encoding, source)
-    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}


 async def parse_article(url: str, source: Optional[str] = None) -> Article:
@@ -302,77 +294,6 @@ async def parse_article(url: str, source: Optional[str] = None) -> Article:
     return article


-async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
-    if source:
-        html_content = source
-    elif url:
-        async with aiohttp.ClientSession() as session:
-            async with session.get(url) as response:
-                html_content = await response.text()
-    else:
-        err(f"Unable to convert nothing to markdown.")
-        return None
-
-    # Use readability to extract the main content
-    doc = Document(html_content)
-    cleaned_html = doc.summary()
-
-    # Parse the cleaned HTML with BeautifulSoup for any additional processing
-    soup = BeautifulSoup(cleaned_html, 'html.parser')
-
-    # Remove any remaining unwanted elements
-    for element in soup(['script', 'style']):
-        element.decompose()
-
-    # Convert to markdown
-    markdown_content = md(str(soup), heading_style="ATX")
-
-    return markdown_content
-
-
-
-async def process_archive(
-    url: str,
-    title: Optional[str] = None,
-    encoding: str = 'utf-8',
-    source: Optional[str] = None,
-) -> Optional[Path]:
-    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
-    readable_title = title if title else f"{url} - {timestamp}"
-
-    content = await html_to_markdown(url, source)
-    if content is None:
-        raise HTTPException(status_code=400, detail="Failed to convert content to markdown")
-
-    if contains_profanity(url, content, 0.2, Archivist.blacklist):
-        info(f"Not archiving {url} due to profanity")
-        return None
-
-    try:
-        markdown_path, relative_path = assemble_archive_path(filename=readable_title, extension=".md")
-    except Exception as e:
-        warn(f"Failed to assemble archive path for {url}: {str(e)}")
-        return None
-
-    markdown_content = f"---\n"
-    markdown_content += f"title: \"{readable_title}\"\n"
-    markdown_content += f"added: {timestamp}\n"
-    markdown_content += f"url: \"{url}\"\n"
-    markdown_content += f"date: \"{dt_datetime.now().strftime('%Y-%m-%d')}\"\n"
-    markdown_content += f"---\n\n"
-    markdown_content += f"# {readable_title}\n\n"
-    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}\n\n"
-    markdown_content += content
-
-    try:
-        markdown_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(markdown_path, 'w', encoding=encoding) as md_file:
-            md_file.write(markdown_content)
-        debug(f"Successfully saved to {markdown_path}")
-        return markdown_path
-    except Exception as e:
-        warn(f"Failed to write markdown file: {str(e)}")
-        return None


@@ -13,6 +13,7 @@ from pathlib import Path
 import filetype
 from PyPDF2 import PdfReader
 from better_profanity import profanity
+from adblockparser import AdblockRules
 from pdfminer.high_level import extract_text as pdfminer_extract_text
 import pytesseract
 from pdf2image import convert_from_path
@@ -184,22 +185,48 @@ def f(file):
     with open(file_path, 'rb') as thefile:
         return thefile


-def contains_profanity(url: str, content: str, threshold: float = 0.2, custom_words: Optional[List[str]] = None) -> bool:
-    custom_words = custom_words or []
-    if any(word.lower() in url.lower() for word in custom_words):
-        info(f"Blacklisted word in {url}")
-        return True
-
-    # Check content for profanity
+def is_ad_or_tracker(url: str, rules: AdblockRules) -> bool:
+    parsed_url = urlparse(url)
+    return rules.should_block(url, { 'domain': parsed_url.netloc })
+
+
+def contains_blacklisted_word(text: str, blacklist: List[str]) -> bool:
+    return any(word.lower() in text.lower() for word in blacklist)
+
+
+def contains_profanity(content: str, threshold: float = 0.01, custom_words: Optional[List[str]] = None) -> bool:
+    custom_words = custom_words or []
+
+    # Combine the profanity library's word list with custom words
     profanity.load_censor_words(custom_words)

     word_list = content.split()
     content_profanity_count = sum(1 for word in word_list if profanity.contains_profanity(word))
     content_profanity_ratio = content_profanity_count / len(word_list) if word_list else 0
-    debug(f"Profanity ratio for {url}: {content_profanity_ratio}")
+    debug(f"Profanity ratio for content: {content_profanity_ratio}")
     return content_profanity_ratio >= threshold


+def load_filter_lists(blocklists_dir: Path):
+    rules = []
+    for file_path in blocklists_dir.glob('*.txt'):
+        try:
+            with open(file_path, 'r', encoding='utf-8') as file:
+                rules.extend(file.read().splitlines())
+            logging.info(f"Loaded blocklist: {file_path.name}")
+        except Exception as e:
+            logging.error(f"Error loading blocklist {file_path.name}: {str(e)}")
+    return rules
+
+
+def initialize_adblock_rules(blocklists_dir: Path):
+    rules = load_filter_lists(blocklists_dir)
+    logging.info(f"Initialized AdblockRules with {len(rules)} rules")
+    return AdblockRules(rules)
+
+
 def get_extension(file):
     try:
         if isinstance(file, str):
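Taken together, the new helpers can be exercised roughly as follows; the blocklist path is a stand-in for BLOCKLISTS_DIR, and the sample URL and word lists are illustrative only:

# Rough usage sketch of the utilities added above.
from pathlib import Path
from sijapi.utilities import initialize_adblock_rules, is_ad_or_tracker, contains_blacklisted_word, contains_profanity

blocklists_dir = Path("sijapi/data/blocklists")   # easylist.txt, easyprivacy.txt from this commit
rules = initialize_adblock_rules(blocklists_dir)

url = "https://ads.example.com/pixel.gif"         # illustrative URL
if is_ad_or_tracker(url, rules):
    print("skip: matches an adblock rule")
elif contains_blacklisted_word(url, ["blacklisted_word"]):
    print("skip: blacklisted word in URL")
elif contains_profanity("some page text", threshold=0.01, custom_words=["blacklisted_word"]):
    print("skip: profanity ratio above threshold")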
@@ -519,3 +546,31 @@ async def run_ssh_command(server, command):
     except Exception as e:
         err(f"SSH command failed for server {server.id}: {str(e)}")
         raise
+
+
+async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
+    if source:
+        html_content = source
+    elif url:
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as response:
+                html_content = await response.text()
+    else:
+        err(f"Unable to convert nothing to markdown.")
+        return None
+
+    # Use readability to extract the main content
+    doc = Document(html_content)
+    cleaned_html = doc.summary()
+
+    # Parse the cleaned HTML with BeautifulSoup for any additional processing
+    soup = BeautifulSoup(cleaned_html, 'html.parser')
+
+    # Remove any remaining unwanted elements
+    for element in soup(['script', 'style']):
+        element.decompose()
+
+    # Convert to markdown
+    markdown_content = md(str(soup), heading_style="ATX")
+
+    return markdown_content
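Because html_to_markdown is a coroutine, callers outside the FastAPI routers need an event loop; a minimal standalone sketch, assuming the function is imported from sijapi.utilities as added above:

# Minimal sketch; inside the routers the function is simply awaited.
import asyncio
from sijapi.utilities import html_to_markdown

async def main():
    markdown = await html_to_markdown(url="https://example.com")
    if markdown is not None:
        print(markdown[:200])  # first 200 characters of the converted page

asyncio.run(main())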