Auto-update: Mon Aug 5 17:35:13 PDT 2024
parent ec21f92242
commit 72d3ba27b2
8 changed files with 127741 additions and 94 deletions
@@ -40,6 +40,7 @@ os.makedirs(ALERTS_DIR, exist_ok=True)
 REQUESTS_DIR = LOGS_DIR / "requests"
 os.makedirs(REQUESTS_DIR, exist_ok=True)
 REQUESTS_LOG_PATH = LOGS_DIR / "requests.log"
+BLOCKLISTS_DIR = DATA_DIR / "blocklists"

 # LOCATE AND WEATHER LOCALIZATIONS
 USER_FULLNAME = os.getenv('USER_FULLNAME')
@@ -16,10 +16,11 @@ PUBLIC:

 TRUSTED_SUBNETS:
   - 127.0.0.1/32
-  - 10.0.0.0/24
-  - 192.168.0.0/24
+  - 10.13.37.0/24
+  - 100.64.64.0/24

 MODULES:
+  archivist: on
   asr: on
   cal: on
   cf: off
@@ -84,13 +85,15 @@ POOL:
   conda_env: 'myenv'

 EXTENSIONS:
+  pgp: on
+  archivist: on
   courtlistener: off
   macnotify: on
   shellfish: on

-TZ: 'UTC'
+TZ: "America/Los_Angeles"

-KEYS: ['{{ SECRET.GLOBAL_API_KEYS }}']
+KEYS: ["{{ SECRET.GLOBAL_API_KEYS }}"]

 GARBAGE:
   COLLECTION_INTERVAL: 60 * 60
sijapi/config/archivist.yaml-example (new file, 6 lines)
@@ -0,0 +1,6 @@
+dir: "~/.private/archive/"
+blacklist:
+  - "http://10.64.64.10"
+  - "http://10.64.64.11"
+  - "blacklisted_word"
+  - "another_blacklisted_word"
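For orientation, a minimal sketch of reading this example config with PyYAML; the dir and blacklist keys come from the file above, but the loader itself (and the yaml dependency) is an assumption rather than how sijapi's Archivist class actually consumes it:

# Hypothetical loader for archivist.yaml-example; sijapi's Archivist class
# may parse this file differently.
from pathlib import Path
import yaml  # assumes PyYAML is installed

with open("sijapi/config/archivist.yaml-example", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

archive_dir = Path(cfg["dir"]).expanduser()  # -> ~/.private/archive/
blacklist = cfg.get("blacklist", [])         # URLs and words to skip
print(archive_dir, blacklist)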
sijapi/data/blocklists/easylist.txt (new file, 74173 lines)
File diff suppressed because it is too large.

sijapi/data/blocklists/easyprivacy.txt (new file, 53375 lines)
File diff suppressed because it is too large.
sijapi/routers/archivist.py (new file, 113 lines)
@@ -0,0 +1,113 @@
+'''
+Used to archive sites visited with browser via the archivist.js UserScript.
+'''
+# routers/archivist.py
+
+from fastapi import APIRouter, BackgroundTasks, UploadFile, Form, HTTPException, Query, Path as FastAPIPath
+import os
+import uuid
+import asyncio
+import shutil
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+from datetime import datetime as dt_datetime, timedelta
+from typing import Optional, List, Tuple
+import aiohttp
+import aiofiles
+import newspaper
+import trafilatura
+from adblockparser import AdblockRules
+from urllib.parse import urlparse
+import logging
+from typing import Optional
+from pathlib import Path
+from newspaper import Article
+from readability import Document
+from markdownify import markdownify as md
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+from datetime import datetime as dt_datetime
+from better_profanity import profanity
+from sijapi.classes import L, API, Archivist
+from sijapi.utilities import html_to_markdown, sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity, is_ad_or_tracker, initialize_adblock_rules, contains_blacklisted_word
+from sijapi import L, Archivist, BLOCKLISTS_DIR, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
+
+archivist = APIRouter()
+
+logger = L.get_module_logger("news")
+def debug(text: str): logger.debug(text)
+def info(text: str): logger.info(text)
+def warn(text: str): logger.warning(text)
+def err(text: str): logger.error(text)
+def crit(text: str): logger.critical(text)
+
+adblock_rules = initialize_adblock_rules(BLOCKLISTS_DIR)
+
+@archivist.post("/archive")
+async def archive_post(
+    url: Optional[str] = Form(None),
+    source: Optional[str] = Form(None),
+    title: Optional[str] = Form(None),
+    encoding: str = Form('utf-8')
+):
+    if not url:
+        warn(f"No URL provided to /archive endpoint.")
+        raise HTTPException(status_code=400, detail="URL is required")
+
+    if is_ad_or_tracker(url, adblock_rules):
+        debug(f"Skipping likely ad or tracker URL: {url}")
+        raise HTTPException(status_code=400, detail="URL is likely an ad or tracker")
+
+    markdown_filename = await process_archive(url, title, encoding, source)
+    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}
+
+
+async def process_archive(
+    url: str,
+    title: Optional[str] = None,
+    encoding: str = 'utf-8',
+    source: Optional[str] = None,
+) -> Optional[Path]:
+
+    # Check URL against blacklist
+    if contains_blacklisted_word(url, Archivist.blacklist):
+        info(f"Not archiving {url} due to blacklisted word in URL")
+        return None
+
+    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
+    readable_title = title if title else f"{url} - {timestamp}"
+
+    content = await html_to_markdown(url, source)
+    if content is None:
+        raise HTTPException(status_code=400, detail="Failed to convert content to markdown")
+
+    # Check content for profanity
+    if contains_profanity(content, threshold=0.01, custom_words=Archivist.blacklist):
+        info(f"Not archiving {url} due to profanity in content")
+        return None
+
+    try:
+        markdown_path, relative_path = assemble_archive_path(filename=readable_title, extension=".md")
+    except Exception as e:
+        warn(f"Failed to assemble archive path for {url}: {str(e)}")
+        return None
+
+    markdown_content = f"---\n"
+    markdown_content += f"title: \"{readable_title}\"\n"
+    markdown_content += f"added: {timestamp}\n"
+    markdown_content += f"url: \"{url}\"\n"
+    markdown_content += f"date: \"{dt_datetime.now().strftime('%Y-%m-%d')}\"\n"
+    markdown_content += f"---\n\n"
+    markdown_content += f"# {readable_title}\n\n"
+    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}\n\n"
+    markdown_content += content
+
+    try:
+        markdown_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(markdown_path, 'w', encoding=encoding) as md_file:
+            md_file.write(markdown_content)
+        debug(f"Successfully saved to {markdown_path}")
+        return markdown_path
+    except Exception as e:
+        warn(f"Failed to write markdown file: {str(e)}")
+        return None
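For context, a sketch of how a client such as the archivist.js UserScript might call the new endpoint; the host, port, and absence of auth headers here are assumptions, since this commit does not show how the router is mounted or secured:

# Hypothetical client call to POST /archive; host and port are placeholders.
import requests

resp = requests.post(
    "http://localhost:4444/archive",
    data={
        "url": "https://example.com/article",  # page to archive
        "title": "Example article",            # optional; defaults to URL + timestamp
        "encoding": "utf-8",
    },
)
print(resp.status_code, resp.json())  # e.g. {"message": "Clip saved successfully", ...}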
@@ -25,7 +25,7 @@ from urllib3.util.retry import Retry
 from fastapi import APIRouter, BackgroundTasks, UploadFile, Form, HTTPException, Query, Path as FastAPIPath
 from pathlib import Path
 from sijapi import L, News, Archivist, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
-from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity
+from sijapi.utilities import html_to_markdown, sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity, is_ad_or_tracker
 from sijapi.routers import gis, llm, tts, note

 news = APIRouter()
@@ -179,6 +179,7 @@ async def download_and_save_article(article, site_name, earliest_date, bg_tasks:
         err(f"Error processing article from {article.url}: {str(e)}")
         return False

+
 async def process_news_site(site, bg_tasks: BackgroundTasks):
     info(f"Downloading articles from {site.name}...")

@@ -251,15 +252,6 @@ async def clip_get(



-@news.post("/archive")
-async def archive_post(
-    url: Optional[str] = Form(None),
-    source: Optional[str] = Form(None),
-    title: Optional[str] = Form(None),
-    encoding: str = Form('utf-8')
-):
-    markdown_filename = await process_archive(url, title, encoding, source)
-    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}


 async def parse_article(url: str, source: Optional[str] = None) -> Article:
@@ -302,77 +294,6 @@ async def parse_article(url: str, source: Optional[str] = None) -> Article:
     return article


-async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
-    if source:
-        html_content = source
-    elif url:
-        async with aiohttp.ClientSession() as session:
-            async with session.get(url) as response:
-                html_content = await response.text()
-    else:
-        err(f"Unable to convert nothing to markdown.")
-        return None
-
-    # Use readability to extract the main content
-    doc = Document(html_content)
-    cleaned_html = doc.summary()
-
-    # Parse the cleaned HTML with BeautifulSoup for any additional processing
-    soup = BeautifulSoup(cleaned_html, 'html.parser')
-
-    # Remove any remaining unwanted elements
-    for element in soup(['script', 'style']):
-        element.decompose()
-
-    # Convert to markdown
-    markdown_content = md(str(soup), heading_style="ATX")
-
-    return markdown_content
-
-
-
-async def process_archive(
-    url: str,
-    title: Optional[str] = None,
-    encoding: str = 'utf-8',
-    source: Optional[str] = None,
-) -> Optional[Path]:
-    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
-    readable_title = title if title else f"{url} - {timestamp}"
-
-    content = await html_to_markdown(url, source)
-    if content is None:
-        raise HTTPException(status_code=400, detail="Failed to convert content to markdown")
-
-    if contains_profanity(url, content, 0.2, Archivist.blacklist):
-        info(f"Not archiving {url} due to profanity")
-        return None
-
-    try:
-        markdown_path, relative_path = assemble_archive_path(filename=readable_title, extension=".md")
-    except Exception as e:
-        warn(f"Failed to assemble archive path for {url}: {str(e)}")
-        return None
-
-    markdown_content = f"---\n"
-    markdown_content += f"title: \"{readable_title}\"\n"
-    markdown_content += f"added: {timestamp}\n"
-    markdown_content += f"url: \"{url}\"\n"
-    markdown_content += f"date: \"{dt_datetime.now().strftime('%Y-%m-%d')}\"\n"
-    markdown_content += f"---\n\n"
-    markdown_content += f"# {readable_title}\n\n"
-    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}\n\n"
-    markdown_content += content
-
-    try:
-        markdown_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(markdown_path, 'w', encoding=encoding) as md_file:
-            md_file.write(markdown_content)
-        debug(f"Successfully saved to {markdown_path}")
-        return markdown_path
-    except Exception as e:
-        warn(f"Failed to write markdown file: {str(e)}")
-        return None


@@ -13,6 +13,7 @@ from pathlib import Path
 import filetype
 from PyPDF2 import PdfReader
 from better_profanity import profanity
+from adblockparser import AdblockRules
 from pdfminer.high_level import extract_text as pdfminer_extract_text
 import pytesseract
 from pdf2image import convert_from_path
@@ -184,22 +185,48 @@ def f(file):
     with open(file_path, 'rb') as thefile:
         return thefile


-def contains_profanity(url: str, content: str, threshold: float = 0.2, custom_words: Optional[List[str]] = None) -> bool:
-    custom_words = custom_words or []
-    if any(word.lower() in url.lower() for word in custom_words):
-        info(f"Blacklisted word in {url}")
-        return True
-
-    # Check content for profanity
+def is_ad_or_tracker(url: str, rules: AdblockRules) -> bool:
+    parsed_url = urlparse(url)
+    return rules.should_block(url, { 'domain': parsed_url.netloc })
+
+
+def contains_blacklisted_word(text: str, blacklist: List[str]) -> bool:
+    return any(word.lower() in text.lower() for word in blacklist)
+
+
+def contains_profanity(content: str, threshold: float = 0.01, custom_words: Optional[List[str]] = None) -> bool:
+    custom_words = custom_words or []
+
+    # Combine the profanity library's word list with custom words
     profanity.load_censor_words(custom_words)

     word_list = content.split()
     content_profanity_count = sum(1 for word in word_list if profanity.contains_profanity(word))
     content_profanity_ratio = content_profanity_count / len(word_list) if word_list else 0
-    debug(f"Profanity ratio for {url}: {content_profanity_ratio}")
+    debug(f"Profanity ratio for content: {content_profanity_ratio}")
     return content_profanity_ratio >= threshold


+def load_filter_lists(blocklists_dir: Path):
+    rules = []
+    for file_path in blocklists_dir.glob('*.txt'):
+        try:
+            with open(file_path, 'r', encoding='utf-8') as file:
+                rules.extend(file.read().splitlines())
+            logging.info(f"Loaded blocklist: {file_path.name}")
+        except Exception as e:
+            logging.error(f"Error loading blocklist {file_path.name}: {str(e)}")
+    return rules
+
+
+def initialize_adblock_rules(blocklists_dir: Path):
+    rules = load_filter_lists(blocklists_dir)
+    logging.info(f"Initialized AdblockRules with {len(rules)} rules")
+    return AdblockRules(rules)
+
+
 def get_extension(file):
     try:
         if isinstance(file, str):
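Taken together, the new helpers can be exercised roughly as follows; the blocklist path is a stand-in for BLOCKLISTS_DIR, and the sample URL and word lists are illustrative only:

# Rough usage sketch of the utilities added above.
from pathlib import Path
from sijapi.utilities import initialize_adblock_rules, is_ad_or_tracker, contains_blacklisted_word, contains_profanity

blocklists_dir = Path("sijapi/data/blocklists")   # easylist.txt, easyprivacy.txt from this commit
rules = initialize_adblock_rules(blocklists_dir)

url = "https://ads.example.com/pixel.gif"         # illustrative URL
if is_ad_or_tracker(url, rules):
    print("skip: matches an adblock rule")
elif contains_blacklisted_word(url, ["blacklisted_word"]):
    print("skip: blacklisted word in URL")
elif contains_profanity("some page text", threshold=0.01, custom_words=["blacklisted_word"]):
    print("skip: profanity ratio above threshold")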
@@ -519,3 +546,31 @@ async def run_ssh_command(server, command):
     except Exception as e:
         err(f"SSH command failed for server {server.id}: {str(e)}")
         raise
+
+
+async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
+    if source:
+        html_content = source
+    elif url:
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as response:
+                html_content = await response.text()
+    else:
+        err(f"Unable to convert nothing to markdown.")
+        return None
+
+    # Use readability to extract the main content
+    doc = Document(html_content)
+    cleaned_html = doc.summary()
+
+    # Parse the cleaned HTML with BeautifulSoup for any additional processing
+    soup = BeautifulSoup(cleaned_html, 'html.parser')
+
+    # Remove any remaining unwanted elements
+    for element in soup(['script', 'style']):
+        element.decompose()
+
+    # Convert to markdown
+    markdown_content = md(str(soup), heading_style="ATX")
+
+    return markdown_content
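Because html_to_markdown is a coroutine, callers outside the FastAPI routers need an event loop; a minimal standalone sketch, assuming the function is imported from sijapi.utilities as added above:

# Minimal sketch; inside the routers the function is simply awaited.
import asyncio
from sijapi.utilities import html_to_markdown

async def main():
    markdown = await html_to_markdown(url="https://example.com")
    if markdown is not None:
        print(markdown[:200])  # first 200 characters of the converted page

asyncio.run(main())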