Auto-update: Mon Aug 5 17:35:13 PDT 2024

parent ec21f92242
commit 72d3ba27b2

8 changed files with 127741 additions and 94 deletions
@@ -40,6 +40,7 @@ os.makedirs(ALERTS_DIR, exist_ok=True)
REQUESTS_DIR = LOGS_DIR / "requests"
os.makedirs(REQUESTS_DIR, exist_ok=True)
REQUESTS_LOG_PATH = LOGS_DIR / "requests.log"
BLOCKLISTS_DIR = DATA_DIR / "blocklists"

# LOCATE AND WEATHER LOCALIZATIONS
USER_FULLNAME = os.getenv('USER_FULLNAME')
@@ -16,10 +16,11 @@ PUBLIC:

TRUSTED_SUBNETS:
  - 127.0.0.1/32
  - 10.0.0.0/24
  - 192.168.0.0/24
  - 10.13.37.0/24
  - 100.64.64.0/24

MODULES:
  archivist: on
  asr: on
  cal: on
  cf: off
@@ -84,13 +85,15 @@ POOL:
  conda_env: 'myenv'

EXTENSIONS:
  pgp: on
  archivist: on
  courtlistener: off
  macnotify: on
  shellfish: on

TZ: "America/Los_Angeles"
TZ: 'UTC'

KEYS: ['{{ SECRET.GLOBAL_API_KEYS }}']
KEYS: ["{{ SECRET.GLOBAL_API_KEYS }}"]

GARBAGE:
  COLLECTION_INTERVAL: 60 * 60
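A note on the new GARBAGE block: YAML does not evaluate arithmetic, so COLLECTION_INTERVAL arrives as the literal string "60 * 60" rather than the integer 3600, and whatever reads the config has to normalize it. A minimal sketch of one way to do that follows; the helper name is purely illustrative and not part of sijapi:

# Hypothetical helper; sijapi's own config loader may handle this differently.
from math import prod

def interval_seconds(raw) -> int:
    # Accept either an int or a simple product string such as "60 * 60".
    if isinstance(raw, int):
        return raw
    return prod(int(part) for part in str(raw).split("*"))

# interval_seconds("60 * 60") == 3600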
6 sijapi/config/archivist.yaml-example Normal file
@@ -0,0 +1,6 @@
dir: "~/.private/archive/"
blacklist:
  - "http://10.64.64.10"
  - "http://10.64.64.11"
  - "blacklisted_word"
  - "another_blacklisted_word"
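For context, the archivist router added below reads these settings through the Archivist config object (Archivist.blacklist in particular). A minimal sketch of loading this YAML by hand with PyYAML, purely to show the shape of the data; sijapi's actual config classes may do this differently:

# Illustrative only; field names follow the example file above.
from pathlib import Path
import yaml  # PyYAML

with open("sijapi/config/archivist.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

archive_dir = Path(cfg["dir"]).expanduser()  # "~/.private/archive/" -> absolute path
blacklist = cfg.get("blacklist", [])         # URLs and words that block archiving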
74173 sijapi/data/blocklists/easylist.txt Normal file
File diff suppressed because it is too large

53375 sijapi/data/blocklists/easyprivacy.txt Normal file
File diff suppressed because it is too large
113 sijapi/routers/archivist.py Normal file
@@ -0,0 +1,113 @@
'''
Used to archive sites visited with browser via the archivist.js UserScript.
'''
# routers/archivist.py

from fastapi import APIRouter, BackgroundTasks, UploadFile, Form, HTTPException, Query, Path as FastAPIPath
import os
import uuid
import asyncio
import shutil
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from datetime import datetime as dt_datetime, timedelta
from typing import Optional, List, Tuple
import aiohttp
import aiofiles
import newspaper
import trafilatura
from adblockparser import AdblockRules
from urllib.parse import urlparse
import logging
from typing import Optional
from pathlib import Path
from newspaper import Article
from readability import Document
from markdownify import markdownify as md
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from datetime import datetime as dt_datetime
from better_profanity import profanity
from sijapi.classes import L, API, Archivist
from sijapi.utilities import html_to_markdown, sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity, is_ad_or_tracker, initialize_adblock_rules, contains_blacklisted_word
from sijapi import L, Archivist, BLOCKLISTS_DIR, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE

archivist = APIRouter()

logger = L.get_module_logger("news")
def debug(text: str): logger.debug(text)
def info(text: str): logger.info(text)
def warn(text: str): logger.warning(text)
def err(text: str): logger.error(text)
def crit(text: str): logger.critical(text)

adblock_rules = initialize_adblock_rules(BLOCKLISTS_DIR)

@archivist.post("/archive")
async def archive_post(
    url: Optional[str] = Form(None),
    source: Optional[str] = Form(None),
    title: Optional[str] = Form(None),
    encoding: str = Form('utf-8')
):
    if not url:
        warn(f"No URL provided to /archive endpoint.")
        raise HTTPException(status_code=400, detail="URL is required")

    if is_ad_or_tracker(url, adblock_rules):
        debug(f"Skipping likely ad or tracker URL: {url}")
        raise HTTPException(status_code=400, detail="URL is likely an ad or tracker")

    markdown_filename = await process_archive(url, title, encoding, source)
    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}

async def process_archive(
    url: str,
    title: Optional[str] = None,
    encoding: str = 'utf-8',
    source: Optional[str] = None,
) -> Optional[Path]:

    # Check URL against blacklist
    if contains_blacklisted_word(url, Archivist.blacklist):
        info(f"Not archiving {url} due to blacklisted word in URL")
        return None

    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
    readable_title = title if title else f"{url} - {timestamp}"

    content = await html_to_markdown(url, source)
    if content is None:
        raise HTTPException(status_code=400, detail="Failed to convert content to markdown")

    # Check content for profanity
    if contains_profanity(content, threshold=0.01, custom_words=Archivist.blacklist):
        info(f"Not archiving {url} due to profanity in content")
        return None

    try:
        markdown_path, relative_path = assemble_archive_path(filename=readable_title, extension=".md")
    except Exception as e:
        warn(f"Failed to assemble archive path for {url}: {str(e)}")
        return None

    markdown_content = f"---\n"
    markdown_content += f"title: \"{readable_title}\"\n"
    markdown_content += f"added: {timestamp}\n"
    markdown_content += f"url: \"{url}\"\n"
    markdown_content += f"date: \"{dt_datetime.now().strftime('%Y-%m-%d')}\"\n"
    markdown_content += f"---\n\n"
    markdown_content += f"# {readable_title}\n\n"
    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}\n\n"
    markdown_content += content

    try:
        markdown_path.parent.mkdir(parents=True, exist_ok=True)
        with open(markdown_path, 'w', encoding=encoding) as md_file:
            md_file.write(markdown_content)
        debug(f"Successfully saved to {markdown_path}")
        return markdown_path
    except Exception as e:
        warn(f"Failed to write markdown file: {str(e)}")
        return None
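The docstring says the /archive endpoint is normally driven by the archivist.js UserScript, which posts the page URL (and optionally its HTML source and title) as form fields. A rough sketch of an equivalent client call follows; the base URL, port, and lack of authentication here are assumptions, not taken from the repository:

# Hypothetical client; point it at wherever the sijapi FastAPI app is served.
import requests

resp = requests.post(
    "http://localhost:4444/archive",  # assumed host/port
    data={
        "url": "https://example.com/some-article",
        "title": "Some Article",       # optional; defaults to "<url> - <timestamp>"
        "encoding": "utf-8",
    },
)
print(resp.status_code, resp.json())   # e.g. {"message": "Clip saved successfully", ...}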
@@ -25,7 +25,7 @@ from urllib3.util.retry import Retry
from fastapi import APIRouter, BackgroundTasks, UploadFile, Form, HTTPException, Query, Path as FastAPIPath
from pathlib import Path
from sijapi import L, News, Archivist, OBSIDIAN_VAULT_DIR, OBSIDIAN_RESOURCES_DIR, DEFAULT_11L_VOICE, DEFAULT_VOICE
from sijapi.utilities import sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity
from sijapi.utilities import html_to_markdown, sanitize_filename, assemble_journal_path, assemble_archive_path, contains_profanity, is_ad_or_tracker
from sijapi.routers import gis, llm, tts, note

news = APIRouter()
@@ -179,6 +179,7 @@ async def download_and_save_article(article, site_name, earliest_date, bg_tasks:
        err(f"Error processing article from {article.url}: {str(e)}")
        return False


async def process_news_site(site, bg_tasks: BackgroundTasks):
    info(f"Downloading articles from {site.name}...")
@@ -251,15 +252,6 @@ async def clip_get(


@news.post("/archive")
async def archive_post(
    url: Optional[str] = Form(None),
    source: Optional[str] = Form(None),
    title: Optional[str] = Form(None),
    encoding: str = Form('utf-8')
):
    markdown_filename = await process_archive(url, title, encoding, source)
    return {"message": "Clip saved successfully", "markdown_filename": markdown_filename}


async def parse_article(url: str, source: Optional[str] = None) -> Article:
@@ -302,77 +294,6 @@ async def parse_article(url: str, source: Optional[str] = None) -> Article:
    return article


async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
    if source:
        html_content = source
    elif url:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                html_content = await response.text()
    else:
        err(f"Unable to convert nothing to markdown.")
        return None

    # Use readability to extract the main content
    doc = Document(html_content)
    cleaned_html = doc.summary()

    # Parse the cleaned HTML with BeautifulSoup for any additional processing
    soup = BeautifulSoup(cleaned_html, 'html.parser')

    # Remove any remaining unwanted elements
    for element in soup(['script', 'style']):
        element.decompose()

    # Convert to markdown
    markdown_content = md(str(soup), heading_style="ATX")

    return markdown_content


async def process_archive(
    url: str,
    title: Optional[str] = None,
    encoding: str = 'utf-8',
    source: Optional[str] = None,
) -> Optional[Path]:
    timestamp = dt_datetime.now().strftime('%b %d, %Y at %H:%M')
    readable_title = title if title else f"{url} - {timestamp}"

    content = await html_to_markdown(url, source)
    if content is None:
        raise HTTPException(status_code=400, detail="Failed to convert content to markdown")

    if contains_profanity(url, content, 0.2, Archivist.blacklist):
        info(f"Not archiving {url} due to profanity")
        return None

    try:
        markdown_path, relative_path = assemble_archive_path(filename=readable_title, extension=".md")
    except Exception as e:
        warn(f"Failed to assemble archive path for {url}: {str(e)}")
        return None

    markdown_content = f"---\n"
    markdown_content += f"title: \"{readable_title}\"\n"
    markdown_content += f"added: {timestamp}\n"
    markdown_content += f"url: \"{url}\"\n"
    markdown_content += f"date: \"{dt_datetime.now().strftime('%Y-%m-%d')}\"\n"
    markdown_content += f"---\n\n"
    markdown_content += f"# {readable_title}\n\n"
    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}\n\n"
    markdown_content += content

    try:
        markdown_path.parent.mkdir(parents=True, exist_ok=True)
        with open(markdown_path, 'w', encoding=encoding) as md_file:
            md_file.write(markdown_content)
        debug(f"Successfully saved to {markdown_path}")
        return markdown_path
    except Exception as e:
        warn(f"Failed to write markdown file: {str(e)}")
        return None
@@ -13,6 +13,7 @@ from pathlib import Path
import filetype
from PyPDF2 import PdfReader
from better_profanity import profanity
from adblockparser import AdblockRules
from pdfminer.high_level import extract_text as pdfminer_extract_text
import pytesseract
from pdf2image import convert_from_path
@@ -184,22 +185,48 @@ def f(file):
    with open(file_path, 'rb') as thefile:
        return thefile


def contains_profanity(url: str, content: str, threshold: float = 0.2, custom_words: Optional[List[str]] = None) -> bool:
    custom_words = custom_words or []
    if any(word.lower() in url.lower() for word in custom_words):
        info(f"Blacklisted word in {url}")
        return True

    # Check content for profanity
def is_ad_or_tracker(url: str, rules: AdblockRules) -> bool:
    parsed_url = urlparse(url)
    return rules.should_block(url, { 'domain': parsed_url.netloc })


def contains_blacklisted_word(text: str, blacklist: List[str]) -> bool:
    return any(word.lower() in text.lower() for word in blacklist)


def contains_profanity(content: str, threshold: float = 0.01, custom_words: Optional[List[str]] = None) -> bool:
    custom_words = custom_words or []

    # Combine the profanity library's word list with custom words
    profanity.load_censor_words(custom_words)

    word_list = content.split()
    content_profanity_count = sum(1 for word in word_list if profanity.contains_profanity(word))
    content_profanity_ratio = content_profanity_count / len(word_list) if word_list else 0
    debug(f"Profanity ratio for {url}: {content_profanity_ratio}")

    debug(f"Profanity ratio for content: {content_profanity_ratio}")
    return content_profanity_ratio >= threshold


def load_filter_lists(blocklists_dir: Path):
    rules = []
    for file_path in blocklists_dir.glob('*.txt'):
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                rules.extend(file.read().splitlines())
            logging.info(f"Loaded blocklist: {file_path.name}")
        except Exception as e:
            logging.error(f"Error loading blocklist {file_path.name}: {str(e)}")
    return rules


def initialize_adblock_rules(blocklists_dir: Path):
    rules = load_filter_lists(blocklists_dir)
    logging.info(f"Initialized AdblockRules with {len(rules)} rules")
    return AdblockRules(rules)


def get_extension(file):
    try:
        if isinstance(file, str):
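Taken together, the new utilities give the archivist a two-stage filter: the adblock rules and the URL blacklist screen the address, then the profanity ratio screens the extracted text. A small sketch of how they compose, with placeholder inputs; the real caller is process_archive() in routers/archivist.py:

# Placeholder inputs for illustration only.
from pathlib import Path

rules = initialize_adblock_rules(Path("sijapi/data/blocklists"))
url = "https://example.com/article"

if is_ad_or_tracker(url, rules):
    print("skip: URL matches a blocklist rule")
elif contains_blacklisted_word(url, ["blacklisted_word"]):
    print("skip: blacklisted word in URL")
else:
    text = "plain article text ..."
    # threshold=0.01 means: skip if at least 1% of words are flagged
    if contains_profanity(text, threshold=0.01, custom_words=["another_blacklisted_word"]):
        print("skip: too much profanity in content")
    else:
        print("archive it")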
@@ -519,3 +546,31 @@ async def run_ssh_command(server, command):
    except Exception as e:
        err(f"SSH command failed for server {server.id}: {str(e)}")
        raise


async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
    if source:
        html_content = source
    elif url:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                html_content = await response.text()
    else:
        err(f"Unable to convert nothing to markdown.")
        return None

    # Use readability to extract the main content
    doc = Document(html_content)
    cleaned_html = doc.summary()

    # Parse the cleaned HTML with BeautifulSoup for any additional processing
    soup = BeautifulSoup(cleaned_html, 'html.parser')

    # Remove any remaining unwanted elements
    for element in soup(['script', 'style']):
        element.decompose()

    # Convert to markdown
    markdown_content = md(str(soup), heading_style="ATX")

    return markdown_content
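To make the conversion pipeline concrete, here is a small synchronous walk-through of the same readability, BeautifulSoup, and markdownify steps on an inline HTML snippet; the sample HTML is made up:

# Stand-alone illustration of the steps inside html_to_markdown().
from readability import Document
from bs4 import BeautifulSoup
from markdownify import markdownify as md

html = "<html><body><article><h1>Title</h1><p>Hello <b>world</b>.</p><script>track()</script></article></body></html>"

doc = Document(html)                        # readability isolates the main content
soup = BeautifulSoup(doc.summary(), 'html.parser')
for element in soup(['script', 'style']):   # drop leftover script/style tags
    element.decompose()

print(md(str(soup), heading_style="ATX"))   # roughly: "# Title\n\nHello **world**."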