Auto-update: Mon Aug 5 17:35:13 PDT 2024

parent ec21f92242
commit 72d3ba27b2

8 changed files with 127741 additions and 94 deletions
@@ -40,6 +40,7 @@ os.makedirs(ALERTS_DIR, exist_ok=True)
REQUESTS_DIR = LOGS_DIR / "requests"
os.makedirs(REQUESTS_DIR, exist_ok=True)
REQUESTS_LOG_PATH = LOGS_DIR / "requests.log"
BLOCKLISTS_DIR = DATA_DIR / "blocklists"

# LOCATE AND WEATHER LOCALIZATIONS
USER_FULLNAME = os.getenv('USER_FULLNAME')
@@ -16,10 +16,11 @@ PUBLIC:

TRUSTED_SUBNETS:
  - 127.0.0.1/32
  - 10.0.0.0/24
  - 192.168.0.0/24
  - 10.13.37.0/24
  - 100.64.64.0/24

MODULES:
  archivist: on
  asr: on
  cal: on
  cf: off
@@ -84,13 +85,15 @@ POOL:
  conda_env: 'myenv'

EXTENSIONS:
  pgp: on
  archivist: on
  courtlistener: off
  macnotify: on
  shellfish: on

TZ: "America/Los_Angeles"
TZ: 'UTC'

KEYS: ['{{ SECRET.GLOBAL_API_KEYS }}']
KEYS: ["{{ SECRET.GLOBAL_API_KEYS }}"]

GARBAGE:
  COLLECTION_INTERVAL: 60 * 60
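A note on the new GARBAGE block: YAML does not evaluate arithmetic, so COLLECTION_INTERVAL arrives as the literal string "60 * 60" rather than the integer 3600, and whatever reads the config has to normalize it. A minimal sketch of one way to do that follows; the helper name is purely illustrative and not part of sijapi:

# Hypothetical helper; sijapi's own config loader may handle this differently.
from math import prod

def interval_seconds(raw) -> int:
    # Accept either an int or a simple product string such as "60 * 60".
    if isinstance(raw, int):
        return raw
    return prod(int(part) for part in str(raw).split("*"))

# interval_seconds("60 * 60") == 3600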
6 sijapi/config/archivist.yaml-example Normal file
@@ -0,0 +1,6 @@
dir: "~/.private/archive/"
blacklist:
  - "http://10.64.64.10"
  - "http://10.64.64.11"
  - "blacklisted_word"
  - "another_blacklisted_word"
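For context, the archivist router added below reads these settings through the Archivist config object (Archivist.blacklist in particular). A minimal sketch of loading this YAML by hand with PyYAML, purely to show the shape of the data; sijapi's actual config classes may do this differently:

# Illustrative only; field names follow the example file above.
from pathlib import Path
import yaml  # PyYAML

with open("sijapi/config/archivist.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

archive_dir = Path(cfg["dir"]).expanduser()  # "~/.private/archive/" -> absolute path
blacklist = cfg.get("blacklist", [])         # URLs and words that block archiving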
74173 sijapi/data/blocklists/easylist.txt Normal file
File diff suppressed because it is too large

53375 sijapi/data/blocklists/easyprivacy.txt Normal file
File diff suppressed because it is too large
113 sijapi/routers/archivist.py Normal file
@@ -0,0 +1,113 @@
'''
Used to archive sites visited with browser via the archivist.js UserScript.
'''
# routers/archivist.py

from fastapi import APIRouter, BackgroundTasks, UploadFile, Form, HTTPException, Query, Path as FastAPIPath
import os
import uuid
import asyncio
import shutil
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from datetime import datetime as dt_datetime, timedelta
from typing import Optional, List, Tuple
import aiohttp
import aiofiles
import newspaper
import trafilatura
from adblockparser import AdblockRules
from urllib.parse import urlparse
import logging
from typing import Optional
from pathlib import Path
from newspaper import Article
from readability import Document
from markdownify import markdownify as md
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from datetime import datetime as dt_datetime
from better_profanity import profanity
from sijapi.classes import L, API, Archivist
from sijapi.utilities import html_to_markdown, sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity, is_ad_or_tracker, initialize_adblock_rules, contains_blacklisted_word
from sijapi import L, Archivist, BLOCKLISTS_DIR, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE

archivist = APIRouter()

logger = L.get_module_logger("news")
def debug(text: str): logger.debug(text)
def info(text: str): logger.info(text)
def warn(text: str): logger.warning(text)
def err(text: str): logger.error(text)
def crit(text: str): logger.critical(text)

adblock_rules = initialize_adblock_rules(BLOCKLISTS_DIR)

@archivist.post("/archive")
async def archive_post(
    url: Optional[str] = Form(None),
    source: Optional[str] = Form(None),
    title: Optional[str] = Form(None),
    encoding: str = Form('utf-8')
):
    if not url:
        warn(f"No URL provided to /archive endpoint.")
        raise HTTPException(status_code=400, detail="URL is required")

    if is_ad_or_tracker(url, adblock_rules):
        debug(f"Skipping likely ad or tracker URL: {url}")
        raise HTTPException(status_code=400, detail="URL is likely an ad or tracker")

    markdown_filename = await process_archive(url, title, encoding, source)
    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}

async def process_archive(
    url: str,
    title: Optional[str] = None,
    encoding: str = 'utf-8',
    source: Optional[str] = None,
) -> Optional[Path]:

    # Check URL against blacklist
    if contains_blacklisted_word(url, Archivist.blacklist):
        info(f"Not archiving {url} due to blacklisted word in URL")
        return None

    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
    readable_title = title if title else f"{url} - {timestamp}"

    content = await html_to_markdown(url, source)
    if content is None:
        raise HTTPException(status_code=400, detail="Failed to convert content to markdown")

    # Check content for profanity
    if contains_profanity(content, threshold=0.01, custom_words=Archivist.blacklist):
        info(f"Not archiving {url} due to profanity in content")
        return None

    try:
        markdown_path, relative_path = assemble_archive_path(filename=readable_title, extension=".md")
    except Exception as e:
        warn(f"Failed to assemble archive path for {url}: {str(e)}")
        return None

    markdown_content = f"---\n"
    markdown_content += f"title: \"{readable_title}\"\n"
    markdown_content += f"added: {timestamp}\n"
    markdown_content += f"url: \"{url}\"\n"
    markdown_content += f"date: \"{dt_datetime.now().strftime('%Y-%m-%d')}\"\n"
    markdown_content += f"---\n\n"
    markdown_content += f"# {readable_title}\n\n"
    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}\n\n"
    markdown_content += content

    try:
        markdown_path.parent.mkdir(parents=True, exist_ok=True)
        with open(markdown_path, 'w', encoding=encoding) as md_file:
            md_file.write(markdown_content)
        debug(f"Successfully saved to {markdown_path}")
        return markdown_path
    except Exception as e:
        warn(f"Failed to write markdown file: {str(e)}")
        return None
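The docstring says the /archive endpoint is normally driven by the archivist.js UserScript, which posts the page URL (and optionally its HTML source and title) as form fields. A rough sketch of an equivalent client call follows; the base URL, port, and lack of authentication here are assumptions, not taken from the repository:

# Hypothetical client; point it at wherever the sijapi FastAPI app is served.
import requests

resp = requests.post(
    "http://localhost:4444/archive",  # assumed host/port
    data={
        "url": "https://example.com/some-article",
        "title": "Some Article",       # optional; defaults to "<url> - <timestamp>"
        "encoding": "utf-8",
    },
)
print(resp.status_code, resp.json())   # e.g. {"message": "Clip saved successfully", ...}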
@@ -25,7 +25,7 @@ from urllib3.util.retry import Retry
from fastapi import APIRouter, BackgroundTasks, UploadFile, Form, HTTPException, Query, Path as FastAPIPath
from pathlib import Path
from sijapi import L, News, Archivist, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity
from sijapi.utilities import html_to_markdown, sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity, is_ad_or_tracker
from sijapi.routers import gis, llm, tts, note

news = APIRouter()
@@ -179,6 +179,7 @@ async def download_and_save_article(article, site_name, earliest_date, bg_tasks:
        err(f"Error processing article from {article.url}: {str(e)}")
        return False


async def process_news_site(site, bg_tasks: BackgroundTasks):
    info(f"Downloading articles from {site.name}...")
@@ -251,15 +252,6 @@ async def clip_get(


@news.post("/archive")
async def archive_post(
    url: Optional[str] = Form(None),
    source: Optional[str] = Form(None),
    title: Optional[str] = Form(None),
    encoding: str = Form('utf-8')
):
    markdown_filename = await process_archive(url, title, encoding, source)
    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}


async def parse_article(url: str, source: Optional[str] = None) -> Article:
@@ -302,77 +294,6 @@ async def parse_article(url: str, source: Optional[str] = None) -> Article:
    return article


async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
    if source:
        html_content = source
    elif url:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                html_content = await response.text()
    else:
        err(f"Unable to convert nothing to markdown.")
        return None

    # Use readability to extract the main content
    doc = Document(html_content)
    cleaned_html = doc.summary()

    # Parse the cleaned HTML with BeautifulSoup for any additional processing
    soup = BeautifulSoup(cleaned_html, 'html.parser')

    # Remove any remaining unwanted elements
    for element in soup(['script', 'style']):
        element.decompose()

    # Convert to markdown
    markdown_content = md(str(soup), heading_style="ATX")

    return markdown_content


async def process_archive(
    url: str,
    title: Optional[str] = None,
    encoding: str = 'utf-8',
    source: Optional[str] = None,
) -> Optional[Path]:
    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
    readable_title = title if title else f"{url} - {timestamp}"

    content = await html_to_markdown(url, source)
    if content is None:
        raise HTTPException(status_code=400, detail="Failed to convert content to markdown")

    if contains_profanity(url, content, 0.2, Archivist.blacklist):
        info(f"Not archiving {url} due to profanity")
        return None

    try:
        markdown_path, relative_path = assemble_archive_path(filename=readable_title, extension=".md")
    except Exception as e:
        warn(f"Failed to assemble archive path for {url}: {str(e)}")
        return None

    markdown_content = f"---\n"
    markdown_content += f"title: \"{readable_title}\"\n"
    markdown_content += f"added: {timestamp}\n"
    markdown_content += f"url: \"{url}\"\n"
    markdown_content += f"date: \"{dt_datetime.now().strftime('%Y-%m-%d')}\"\n"
    markdown_content += f"---\n\n"
    markdown_content += f"# {readable_title}\n\n"
    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}\n\n"
    markdown_content += content

    try:
        markdown_path.parent.mkdir(parents=True, exist_ok=True)
        with open(markdown_path, 'w', encoding=encoding) as md_file:
            md_file.write(markdown_content)
        debug(f"Successfully saved to {markdown_path}")
        return markdown_path
    except Exception as e:
        warn(f"Failed to write markdown file: {str(e)}")
        return None
@@ -13,6 +13,7 @@ from pathlib import Path
import filetype
from PyPDF2 import PdfReader
from better_profanity import profanity
from adblockparser import AdblockRules
from pdfminer.high_level import extract_text as pdfminer_extract_text
import pytesseract
from pdf2image import convert_from_path
@@ -184,22 +185,48 @@ def f(file):
    with open(file_path, 'rb') as thefile:
        return thefile


def contains_profanity(url: str, content: str, threshold: float = 0.2, custom_words: Optional[List[str]] = None) -> bool:
    custom_words = custom_words or []
    if any(word.lower() in url.lower() for word in custom_words):
        info(f"Blacklisted word in {url}")
        return True

    # Check content for profanity
def is_ad_or_tracker(url: str, rules: AdblockRules) -> bool:
    parsed_url = urlparse(url)
    return rules.should_block(url, { 'domain': parsed_url.netloc })


def contains_blacklisted_word(text: str, blacklist: List[str]) -> bool:
    return any(word.lower() in text.lower() for word in blacklist)


def contains_profanity(content: str, threshold: float = 0.01, custom_words: Optional[List[str]] = None) -> bool:
    custom_words = custom_words or []

    # Combine the profanity library's word list with custom words
    profanity.load_censor_words(custom_words)

    word_list = content.split()
    content_profanity_count = sum(1 for word in word_list if profanity.contains_profanity(word))
    content_profanity_ratio = content_profanity_count / len(word_list) if word_list else 0
    debug(f"Profanity ratio for {url}: {content_profanity_ratio}")

    debug(f"Profanity ratio for content: {content_profanity_ratio}")
    return content_profanity_ratio >= threshold


def load_filter_lists(blocklists_dir: Path):
    rules = []
    for file_path in blocklists_dir.glob('*.txt'):
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                rules.extend(file.read().splitlines())
            logging.info(f"Loaded blocklist: {file_path.name}")
        except Exception as e:
            logging.error(f"Error loading blocklist {file_path.name}: {str(e)}")
    return rules


def initialize_adblock_rules(blocklists_dir: Path):
    rules = load_filter_lists(blocklists_dir)
    logging.info(f"Initialized AdblockRules with {len(rules)} rules")
    return AdblockRules(rules)


def get_extension(file):
    try:
        if isinstance(file, str):
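Taken together, the new utilities give the archivist a two-stage filter: the adblock rules and the URL blacklist screen the address, then the profanity ratio screens the extracted text. A small sketch of how they compose, with placeholder inputs; the real caller is process_archive() in routers/archivist.py:

# Placeholder inputs for illustration only.
from pathlib import Path

rules = initialize_adblock_rules(Path("sijapi/data/blocklists"))
url = "https://example.com/article"

if is_ad_or_tracker(url, rules):
    print("skip: URL matches a blocklist rule")
elif contains_blacklisted_word(url, ["blacklisted_word"]):
    print("skip: blacklisted word in URL")
else:
    text = "plain article text ..."
    # threshold=0.01 means: skip if at least 1% of words are flagged
    if contains_profanity(text, threshold=0.01, custom_words=["another_blacklisted_word"]):
        print("skip: too much profanity in content")
    else:
        print("archive it")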
@@ -519,3 +546,31 @@ async def run_ssh_command(server, command):
    except Exception as e:
        err(f"SSH command failed for server {server.id}: {str(e)}")
        raise


async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
    if source:
        html_content = source
    elif url:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                html_content = await response.text()
    else:
        err(f"Unable to convert nothing to markdown.")
        return None

    # Use readability to extract the main content
    doc = Document(html_content)
    cleaned_html = doc.summary()

    # Parse the cleaned HTML with BeautifulSoup for any additional processing
    soup = BeautifulSoup(cleaned_html, 'html.parser')

    # Remove any remaining unwanted elements
    for element in soup(['script', 'style']):
        element.decompose()

    # Convert to markdown
    markdown_content = md(str(soup), heading_style="ATX")

    return markdown_content
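To make the conversion pipeline concrete, here is a small synchronous walk-through of the same readability, BeautifulSoup, and markdownify steps on an inline HTML snippet; the sample HTML is made up:

# Stand-alone illustration of the steps inside html_to_markdown().
from readability import Document
from bs4 import BeautifulSoup
from markdownify import markdownify as md

html = "<html><body><article><h1>Title</h1><p>Hello <b>world</b>.</p><script>track()</script></article></body></html>"

doc = Document(html)                        # readability isolates the main content
soup = BeautifulSoup(doc.summary(), 'html.parser')
for element in soup(['script', 'style']):   # drop leftover script/style tags
    element.decompose()

print(md(str(soup), heading_style="ATX"))   # roughly: "# Title\n\nHello **world**."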