commit e669ead3af4db462fa146efd46fbf1a7989140be Author: sanj <67624670+iodrift@users.noreply.github.com> Date: Sun Jun 23 00:24:48 2024 -0700 Initial commit (excluding testbed files) diff --git a/README.md b/README.md new file mode 100644 index 0000000..181cebf --- /dev/null +++ b/README.md @@ -0,0 +1,481 @@ +``` +#────────────────────────────────────────────────────────────────────────────────── +# C O N F I G U R A T I O N F I L E +#────────────────────────────────────────────────────────────────────────────────── +# +# Hi friend! You've found my hidden .config.YAML-example file. Do you like +# old-school ASCII art? I bet you do. So listen, this'll be your method for +# configuring sijapi, and nothing works until you at least: +# +# (1) fill in the ESSENTIALS category, and +# +# (2) rename this file `.config.yaml` +# +# ... and even then, certain features will not work until you set other +# relevant variables below. +# +# So get yourself a beverage, put on some sick beats, and settle in for a vibe-y +# configuration sesh. Remember to read my detailed notes if you ever feel lost, +# and most important, remember: +# +# you are NOT alone, +# I love you SO much, +# and you are SO worthy. <3 +# +# y o u r b f & b f 4 e , † +# .x+=:. . . +# z` ^% @88> .. @88> +# . .d`` %8P +# .@8Ned8" . "8P u @8Ne. .u . +# .@^%8888" .@88u . us888u. %8888:u@88N .@88u +# x88: `)8b. ''888E` u888u. .@88 "8888" `888I 888. ''888E` +# ~ 8888N=*8888 888E `'888E 9888 9888 888I 888I 888E +# %8" R88 888E 888E 9888 9888 888I 888I 888E +# @8Wou 9% 888E 888E 9888 9888 uW888L 888' 888E +# .888888P` 888& 888E 9888 9888 '*88888Nu88P 888& +# ` ^"F R888" 888E "888*""888" ~ '88888F` R888" +# "" 888E ^Y" ^Y' 888 ^ "" +# 888E *8E +# 888P '8> † biggest fan +# .J88" " " and best +# friend 4 e v e r +# +# B U T I H E A R Y O U : +# L E T ' S T A K E I T S L O W A N D +# ────────────── S T A R T W I T H T H E ────────────── +# +# ███████╗███████╗███████╗███████╗███╗ ██╗████████╗██╗ █████╗ ██╗ ███████╗ +# ██╔════╝██╔════╝██╔════╝██╔════╝████╗ ██║╚══██╔══╝██║██╔══██╗██║ ██╔════╝ +# █████╗ ███████╗███████╗█████╗ ██╔██╗ ██║ ██║ ██║███████║██║ ███████╗ +# ██╔══╝ ╚════██║╚════██║██╔══╝ ██║╚██╗██║ ██║ ██║██╔══██║██║ ╚════██║ +# ███████╗███████║███████║███████╗██║ ╚████║ ██║ ██║██║ ██║███████╗███████║ +# ╚══════╝╚══════╝╚══════╝╚══════╝╚═╝ ╚═══╝ ╚═╝ ╚═╝╚═╝ ╚═╝╚══════╝╚══════╝ +# ───────────────────────────────────────────────────────────────── +# +#─── first, bind an ip address and port : ────────────────────────────────────────── +HOST_NET=0.0.0.0 +HOST_PORT=4444 +BASE_URL=https://api.sij.ai + +#─── notes: ────────────────────────────────────────────────────────────────────── +# +# HOST_NET† and HOST_PORT comprise HOST and determine the ip and port the server binds to. +# BASE_URL is used to assemble URLs, e.g. in the MS authentication flow and for serving images generated on the sd router. +# BASE_URL should match the base URL used to access sijapi sans endpoint, e.g. http://localhost:4444 or https://api.sij.ai +# +# † Take care here! Please ensure you understand the implications of setting HOST_NET to anything besides 127.0.0.1, and configure your firewall and router appropriately if you do. Setting HOST_NET to 0.0.0.0, for instance, opens sijapi to any device the server running it is accessible to — including potentially frightening internet randos (depending how your firewall, router, and NAT are configured). 
+#
+#   Here are a few options to consider to more securely enable access from
+#   other devices:
+#
+#       (1) if all access can occur over Tailscale, either:
+#               (a) leave HOST_NET set to 127.0.0.1, run `tailscale cert
+#                   $(tailscale whois $(tailscale ip | head -n 1) | awk
+#                   '/Name:/ {print $2}')` if you haven't already issued
+#                   yourself a TLS certificate on Tailscale, and then run
+#                   `tailscale serve --bg --https=4443 4444` to expose sijapi
+#                   to your other tailscale-enabled devices at
+#                   `https://{device}.{magicdns-domain}.net:4443`; or
+#               (b) set HOST_NET to your server's Tailscale IP (this should
+#                   work, but doesn't do so reliably for me)
+#
+#       (2) if WAN access truly is required, leave HOST_NET set to 127.0.0.1
+#           and configure either:
+#               (a) a Cloudflare tunnel, or
+#               (b) a reverse proxy with HTTPS (Caddy is excellent for this).
+#
+#   And please be sure to set a strong API key either way, but especially for (2).
+# ──────────
+#
+#──── configure API key authorization and select exemptions ───────────────────────
+GLOBAL_API_KEY=sk-NhrtQwCHNdK5sRZC
+PUBLIC_SERVICES=/id,/ip,/health,/img/,/cl/dockets,/cl/search,/cd/alert
+TRUSTED_SUBNETS=127.0.0.1/32,10.13.37.0/24,100.64.64.0/24
+#─── notes: ──────────────────────────────────────────────────────────────────────
+#
+#   GLOBAL_API_KEY determines the API key that will be required to access all endpoints, except access to PUBLIC_SERVICES or from TRUSTED_SUBNETS. Authentication is made via an `Authorization: Bearer {GLOBAL_API_KEY}` header.
+#   TRUSTED_SUBNETS might commonly include 127.0.0.1/32 (localhost), 100.x.x.0/24 (Tailscale tailnet), and/or 192.168.x.0/24 or 10.x.x.0/24 (local network).
+#   When configuring a reverse proxy or Cloudflare tunnel, please verify traffic through it does not appear to sijapi (i.e. in ./logs) as though it were coming from any of the subnets specified here. For sij, using Caddy, it does not, but your setup may differ.
+# ──────────
+#
+#─── router selection: ────────────────────────────────────────────────────────────
+ROUTERS=asr,llm,health,hooks,locate,note,sd,serve,summarize,time,tts,weather
+UNLOADED=auth,calendar,cf,email,ig
+#─── notes: ──────────────────────────────────────────────────────────────────────
+#
+#   ROUTERS determines which routers are loaded.†
+#   UNLOADED is not used directly -- it's just there to help keep track of which routers are disabled.
+#
+#  † ┓ ┏ orth bearing in mind: some routers inherently rely on other routers,
+#    ┃┃┃ 3rd party APIs, or other apps being installed locally. If a router that's
+#    ┗┻┛ set to load (i.e. is included in ROUTERS) depends on another router,
+#        that other router will also load, irrespective of whether it's listed.
+#
+#   But let's get down to brass tacks, shall we?
+#
+#        asr:  requires faster_whisper — $ pip install faster_whisper — and
+#              downloading the model file specified in ASR_DEFAULT_MODEL.
+#
+#       auth:  authenticates a Microsoft 365 account (for email & calendar).
+#
+#   calendar:  requires (1) a Microsoft 365 account with a properly configured
+#              Azure Active Directory app, and/or (2) Calendars on macOS.
+#
+#         cf:  interfaces with the Cloudflare API and Caddy to register new
+#              [sub-]domains on Cloudflare and deploy them with Caddy as
+#              reverse proxy.
+#
+#        llm:  requires ollama — $ pip install ollama — and downloading the
+#              models set in LLM_DEFAULT_MODEL and LLM_VISION_MODEL.
+#
+#      email:  designed for accessing Protonmail via Protonmail Bridge and/or
+#              Microsoft 365, but should work with any IMAP/SMTP email account.
+#
+#      hooks:  designed for two specific use cases: monitoring court dockets
+#              through CourtListener.org, and monitoring arbitrary web pages for
+#              changes in tandem with a self-hosted changedetection.io instance.
+#              Both require accounts; other functionality would require
+#              additional / modified code.
+#
+#         ig:  requires an Instagram account, with credentials and other settings
+#              configured separately in the ig_config.json file; relies heavily
+#              on the llm and sd routers which have their own dependencies.
+#
+#     locate:  some endpoints work as is, but the core location tracking
+#              functionality requires Postgresql with the PostGIS extension and
+#              is designed specifically to pair with a mobile device where
+#              Pythonista is installed and configured to run the
+#              `gps_tracker.py` and `gps_upload.py` scripts periodically or per a
+#              repeating condition (e.g. via automation under Apple Shortcuts).
+#
+#       note:  designed for use with Obsidian plus the Daily Notes and Tasks
+#              core extensions; and the Admonitions, Banners, Icons (with the
+#              Lucide pack), and Make.md community extensions. Moreover `note`
+#              relies heavily on the calendar, llm, locate, sd, summarize, time,
+#              tts, and weather routers and accordingly on the external
+#              dependencies of each.
+#
+#         sd:  requires ComfyUI plus any modules and StableDiffusion models
+#              set in sd_config and individual workflow .json files.
+#
+#  summarize:  relies on the llm router and thus requires ollama.
+#
+#       time:  requires the subscription-based macOS app 'Timing' (one of many
+#              apps that together make SetApp an incredible value for macOS users!)
+#
+#        tts:  designed for use with coqui — $ pip install coqui — and/or the
+#              ElevenLabs API.
+#
+#    weather:  requires a VisualCrossing API key and is designed for (but doesn't
+#              itself strictly require) Postgresql with the PostGIS extension;
+#              (... but it presently relies on the locate router, which does).
+#
+#
+#   ... Whew! that was a lot, right? I'm so glad we're in this together...
+# ──────────
+#
+#─────────────────────── Y O U ' R E   G O N N A   L O V E ────────────────────────
+#
+# ░ ░░ ░░ ░ ░░░░░░░░ ░░░ ░░░ ░░ ░░░░░░░ ░
+# ▒▒▒▒ ▒▒▒▒ ▒▒▒▒ ▒▒▒▒ ▒▒▒▒ ▒▒▒▒▒▒▒ ▒▒▒▒▒▒▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒ ▒▒▒▒▒▒▒ ▒▒▒▒▒▒▒
+# ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓▓▓▓▓ ▓▓ ▓▓▓▓▓▓▓ ▓▓▓▓ ▓ ▓▓▓▓▓▓▓ ▓▓▓
+# ████ ████ ████ ████ █████████████ █ ████ █ █ ███████ ███████
+# ████ ████ ████ █ █ ██ ███ ██ ████ █ █ █
+#
+#                          A N D   I ' M   N O T.   E V E N.   J E A L O U S.
+#                 Y O U   D E S E R V E   I T   A L L ,   B A B Y C A K E S.
+#
+#─── use tailscale for secure remote access: ───────────────────────────────────────
+TS_IP=100.64.64.20
+TS_SUBNET=100.64.64.0/24
+TS_ID=sij-mbp16
+TS_TAILNET=starling-sailfin
+TAILSCALE_API_KEY=tskey-api-kosR4MfJtF11CNTRL-zJu4odnpr4huLwAGsuy54hvkJi2ScVWQL
+#─── notes: ──────────────────────────────────────────────────────────────────────
+#
+#   TS_IP should match the Tailscale IP of the device. But this is deprecated, and if the functionality becomes relevant again, it should come back in the form of a dynamic check (`tailscale status` in a shell subprocess) in __init__.py or even the /id endpoint.
+#   TS_SUBNET should match the tailnet's subnet in IP/CIDR format.
+#   TS_ID currently has two roles: it's used to assemble the complete MagicDNS of the server, and it determines what the /id endpoint on the health router returns. This is relevant where multiple servers run the script behind a load balancer (e.g. Caddy), as a means to check which server responds. Bear in mind that /id is NOT API key-protected by default here.
+#   TS_TAILNET should match the tailnet's MagicDNS domain (omitting the `.net`, for reasons)
+# ──────────
+#
+#────────────── U & M E ── W E   C A N   G E T   T H R O U G H ──────────────────
+#
+# ██▓███ ▒█████ ██████ ▄▄▄█████▓ ▄████ ██▀███ ▓█████ ██████
+# ▓██░ ██▒██▒ ██▒▒██ ▒ ▓ ██▒ ▓▒ ██▒ ▀█▒▓██ ▒ ██▒▓█ ▀ ▒██ ▒
+# ▓██░ ██▓▒██░ ██▒░ ▓██▄ ▒ ▓██░ ▒░▒██░▄▄▄░▓██ ░▄█ ▒▒███ ░ ▓██▄
+# ▒██▄█▓▒ ▒██ ██░ ▒ ██▒░ ▓██▓ ░ ░▓█ ██▓▒██▀▀█▄ ▒▓█ ▄ ▒ ██▒
+# ▒██▒ ░ ░ ████▓▒░▒██████▒▒ ▒██▒ ░ ░▒▓███▀▒░██▓ ▒██▒░▒████▒▒██████▒▒
+# ▒██▒ ░ ░ ▒░▒░▒░ ▒ ▒▓▒ ▒ ░ ▒ ░░ ░▒ ▒ ░ ▒▓ ░▒▓░░░ ▒░ ░▒ ▒▓▒ ▒ ░
+# ▒▓▒░ ░ ▒ ▒░ ░ ░▒ ░ ░ ░ ░ ░ ░▒ ░ ▒░ ░ ░ ░░ ░▒ ░ ░
+# ░▒ ░ ░ ░ ▒ ░ ░ ░ ░ ░ ░ ░ ░░ ░ ░ ░ ░ ░
+# ░░ ░ ░ T O G E T H ░ R . ░ ░ ░ ░ ░
+# ░
+#─── for weather and locate modules: ───────────── J U S T   H O L D   M Y   H A N D .
+DB=sij
+#                                                  R E A L   T I G H T.
+DB_HOST=127.0.0.1
+DB_PORT=5432
+#                                     U   G O T   T H I S ,   K ?
+DB_USER=sij
+DB_PASS='Synchr0!'
+#                             Y E A H . . .
+DB_SSH=100.64.64.15
+#                      * J U S T   L I K E   T H A T . *
+DB_SSH_USER=sij
+DB_SSH_PASS='Synchr0!'
+#─── notes: ────────────────────────────────────────────── S E E ?   E Z - P Z
+#
+#   DB, DB_HOST, DB_PORT, DB_USER, and DB_PASS should specify those respective
+#   credentials for your Postgres database. DB_SSH and associated _USER and _PASS
+#   variables allow database access over an SSH tunnel.
+#
+#   In the current implementation, we rely on Postgres to hold:
+#       i.  user-logged location data (locate module), and
+#      ii.  results from past weather forecast checks (weather module).
+#
+#   A future version will hopefully make use of PostGIS's geocoding capabilities,
+#   and add a vector database for the LLM module. Until then it's up to you if the
+#   locate and weather modules are worth the hassle of maintaining Postgres.
+# ──────────
+#
+#
+#───── Y O U   C A N   S I T   T H I S   O N E) O U T   B A B E , ────────<3─────────
+# ( ( ( I F Y O U ' D )
+# ))\( ( /(( L I K E . . . ( (
+# ( (()/(( /((_)\ )\())),----,.
+# )\((__ ))\( ()) |__))((_)- ))((,' ,'
+# ,' , `. /((_)\(_) / / '. |(_)|_ ,' .'
+# ,-+-,.' _ | / / '. / ../ ; ,---. ,----.' .'
+# ,-+-. ; , || | : /`. / \ ``\ .`- ' / \ | | .'
+# ,--.'|' | ;|; | |--` \___\/ \ : / / ' : : |--,
+# | | ,', | ':| : ;_ \ : | . ' / : | ;.' \
+# | | / | | || \ \ `. / / / ' / ; | | |
+# ' | : | : |, `----. \ ) \ \ \ | : \ `----'.'\ ;
+# ; . | ; |--' )(__ \ \ | ((__ / : |; | ``. __ \ . |
+# | : | | , / /`--' / /)\(/\ / :' ; \ / /\/ / :
+# | : ' |/ '--'. / / ,,/ ',- .' | .\ |/ ,,/ ',- .
+# ; | |`-' `--'---' \ ''\ ; | : '; :\ ''\ ;
+# | ;/ O R , Y U P , \ \ .' \ \ / \ \ .'
+# '---'B U R N I T A L L D O W N.-`-,,-' `---`--` `--`-,-'
+# Y O U H A V E A
+# G O D D E S S O F D E S T R U C T I O N W I T H I N ,
+# A N D T H A T I S S O V A L I D !!
+#─── ms365 (calendars): ──────────────────────────────────────────────────────────────
+MS365_TOGGLE=False
+ICAL_TOGGLE=True
+ICALENDARS=3CCC9C7B-BFF0-4850-9CE9-BC504859CBC6,E68FE085-2ECA-4097-AF0A-8D38C404D8DA,AB5A0473-16DD-4916-BD6D-6CB916726605
+MS365_CLIENT_ID=ce8cbd24-f146-4dc7-8ee7-51d9b69dec59
+MS365_TENANT_ID=bad78048-a6e0-47b1-a24b-403c444aa349
+MS365_SECRET=gbw8Q~7U90GMdvneNnPnzAUt5nWVJPbOsagLPbMe
+MS365_THUMBPRINT=4CD86699A8B675411EE9C971CB2783E11F9E52CB
+MS365_SCOPE=basic,calendar_all
+MS365_TOKEN_FILE=oauth_token.txt
+#─── notes: ──────────────────────────────────────────────────────────────────────────
+#
+#   MS365_CLIENT_ID, _TENANT_ID, _SECRET, and _SCOPE must be obtained from Microsoft
+#   via the Azure portal, by creating a new app registration and an accompanying secret.
+#   MS365_THUMBPRINT is a vestige of an earlier failed attempt to get this working,
+#   and for now is deprecated. I recommend seeking out a well-reviewed tutorial for
+#   creating an app on Azure with a client_id and secret and the necessary scopes for
+#   individual calendar access, because I had one heck of a time trying various approaches.
+#   Do better, Microsoft.
+# ──────────
+#
+#──────────────────────────────── I   B E T   Y O U ──────────────────────────────────
+#      R E C E I V E   A   L O T   O F   L O V E   L E T T E R S   O V E R
+#
+# .----------------. .----------------. .----------------. .----------------.
+# | .--------------. | .--------------. | .--------------. | .--------------. |
+# | | _____ | | | ____ ____ | | | __ | | | ______ | |
+# | | |_ _| | | ||_ \ / _|| | | / \ | | | |_ __ \ | |
+# | | | | | | | | \/ | | | | / /\ \ | | | | |__) | | |
+# | | | | | | | | |\ /| | | | | / ____ \ | | | | ___/ | |
+# | | _| |_ | | | _| |_\/_| |_ | | | _/ / \ \_ | | | _| |_ | |
+# | | |_____| | | ||_____||_____|| | ||____| |____|| | | |_____| | |
+# | | | | | | | | | | | | |
+# | '--------------' | '--------------' | '--------------' | '--------------' |
+# '----------------' '----------------' '----------------' '----------------'
+#
+#                                E M A I L
+#
+#─── imap & smtp: ────────────────────────────────────────────────────────────────────
+IMAP_HOST=127.0.0.1
+EMAIL_ADDRESS='sij@sij.law'
+EMAIL_PASSWORD='hesSw7Kum16z-_yxI4kfXQ'
+IMAP_PORT=1143
+IMAP_ENCRYPTION=STARTTLS
+SMTP_PORT=1025
+SMTP_ENCRYPTION=SSL
+#─── notes: ───────────────────────────────────────────────────────────────────────────
+#
+#   This is primarily for summarizing incoming emails. Any IMAP account should work, but
+#   I focused testing on a somewhat complex setup involving Protonmail Bridge.
+# ──────────
+#
+#──────────────────────────────── G E T   S I L L Y ────────────────────────────────
+#                       T H E N   G O   B O N K E R S
+#                       W I T H   Y O U R   O W N
+#
+# ░▒▓█▓▒░ ░▒▓█▓▒░ ░▒▓██████▒▓██████▒░
+# ░▒▓█▓▒░ ░▒▓█▓▒░ ░▒▓█▓▒░░▒▓█▓▒░░▒▓█▓▒░
+# ░▒▓█▓▒░ ░▒▓█▓▒░ ░▒▓█▓▒░░▒▓█▓▒░░▒▓█▓▒░
+# ░▒▓█▓▒░ ░▒▓█▓▒░ ░▒▓█▓▒░░▒▓█▓▒░░▒▓█▓▒░
+# ░▒▓█▓▒░ ░▒▓█▓▒░ ░▒▓█▓▒░░▒▓█▓▒░░▒▓█▓▒░
+# ░▒▓█▓▒░ ░▒▓█▓▒░ ░▒▓█▓▒░░▒▓█▓▒░░▒▓█▓▒░
+# ░▒▓████████▓▒ ░▒▓████████▓▒ ░▒▓█▓▒░░▒▓█▓▒░░▒▓█▓▒░
+#
+#                       ( F O R   R E A L   T H O U G H — T H E S E
+#─── via ollama (local llm):        A R E   S O   H O T   R I G H T   N O W )
+LLM_URL=http://localhost:11434
+SYSTEM_MSG=You are a helpful AI assistant.
+DEFAULT_LLM=dolphin-mistral
+DEFAULT_VISION=llava-llama3
+SUMMARY_MODEL=dolphin-mistral
+SUMMARY_CHUNK_SIZE=4000
+SUMMARY_CHUNK_OVERLAP=100
+SUMMARY_TPW=1.3
+SUMMARY_LENGTH_RATIO=4
+SUMMARY_MIN_LENGTH=150
+SUMMARY_TOKEN_LIMIT=4096
+SUMMARY_INSTRUCT='You are an AI assistant that provides accurate summaries of text -- nothing more and nothing less. You must not include ANY extraneous text other than the summary. Do not include comments apart from the summary, do not preface the summary, and do not provide any form of postscript. Do not add paragraph breaks. Do not add any kind of formatting. Your response should begin with, consist of, and end with an accurate plaintext summary.'
+SUMMARY_INSTRUCT_TTS='You are an AI assistant that provides email summaries for Sanjay -- nothing more and nothing less. You must not include ANY extraneous text other than the summary. Do not include comments apart from the summary, do not preface the summary, and do not provide any form of postscript. Do not add paragraph breaks. Do not add any kind of formatting. 
Your response should begin with, consist of, and end with an accurate plaintext summary. Your response will undergo Text-To-Speech conversion and be added to Sanjay's private podcast. Providing adequate context (Sanjay did not send this question to you, he will only hear your response) but aiming for conciseness and precision, and bearing in mind the Text-To-Speech conversion (avoiding acronyms and formalities), summarize the following.'
+DEFAULT_VOICE=Luna
+#─── notes: ──────────────────────────────────────────────────────────────────────────
+#
+#   The exact values here will depend on what software you are using to inference an
+#   LLM, and of course what models and capabilities are available through it. The
+#   script was designed for use with `ollama`, but most of the functionality should
+#   work equally well with LM Studio, LocalAI, etc.
+#
+#   DEFAULT_LLM is self-explanatory; DEFAULT_VISION is used for image recognition
+#   within a multimodal chat context, such as on the ig module for generating
+#   intelligible comments on Instagram posts, or more realistic captions for
+#   sd-generated images.
+#
+#   Note it's possible to specify a separate model for general purposes and for
+#   summarization tasks. The other SUMMARY_ variables call for some explanation,
+#   in particular six that are most relevant when summarizing very long documents:
+#
+#      SUMMARY_CHUNK_SIZE: determines the maximum length, in tokens, of the pieces
+#                          that the text is split into and sent individually to
+#                          the model.
+#
+#   SUMMARY_CHUNK_OVERLAP: determines how much of each chunk is overlapped with the
+#                          prior and next chunks. Set it too high and you get
+#                          repetition; set it too low and context is lost between
+#                          chunks, yielding poor summary results. The summarization
+#                          algorithm is flawed, but I've gotten the best results
+#                          with this set around 100–200.
+#
+#             SUMMARY_TPW: used in estimating the token count of a prompt for
+#                          purposes of complying with the maximum tokens a model
+#                          can handle at once. The best you can do is estimate. I
+#                          tend to use long words fairly excessively and found my
+#                          average was 1.3 tokens per word. YMMV.
+#
+#    SUMMARY_LENGTH_RATIO: this is the primary control over the length of generated
+#                          summaries, expressed as the ratio of original text length
+#                          to summary length. The default, 4, means the summaries
+#                          will be around 1/4 the length of the original text you
+#                          provide it.
+#
+#      SUMMARY_MIN_LENGTH: the default SUMMARY_LENGTH_RATIO of 4 isn't ideal for
+#                          very short texts, but setting it any lower sacrifices
+#                          conciseness in summaries of longer texts. In short, one
+#                          size doesn't fit all. The compromise I landed on was to
+#                          set a "maximum minimum" summary length: under no
+#                          circumstances will the script impose a smaller maximum
+#                          length than this value.
+#
+#        SUMMARY_INSTRUCT: sets the prompt used when summarizing text.
+#
+#    SUMMARY_INSTRUCT_TTS: sets a separate prompt for use when summarizing text
+#                          where tts output was requested; tends to yield "cleaner"
+#                          audio with fewer numbers (page numbers, citations) and
+#                          other information extraneous to spoken contexts.
+#
+#           DEFAULT_VOICE: used for all tts tasks when a specific voice is not
+#                          requested.
+# ──────────
+#
+#────,-_/────────── W E   C A N   E X P E R I M E N T   W I T H ──────────.───────────
+# ' | ,~-,-. ,-. ,-. ,--. | --' ,--. ,-. ,--. ,-. ,-. |-- . ,-. ,-.
+# .^ | | | | ,--| | | | --' | -,- | --' | | | --' | ,--| | | | | | |
+# `--' ' ' ' `-^ `-| `--' `---| `--' ' ' `--' ' `--^ `' ` `-' ' '
+# , | ,-. | ~ I N T H E N U D E . 
~ +# `~~' `-+' +# O R F U L L Y C L O T H E D ── U P T O Y O U +# +#─── via comfyui (stable diffusion): ───── ( B U T L E T M E K N O W , Y E A H ? ) +COMFYUI_URL=http://localhost:8188 +COMFYUI_DIR=/Users/sij/workshop/sd/ComfyUI +PHOTOPRISM_USER=NOT_IMPLEMENTED +PHOTOPRISM_PASS=NOT_IMPLEMENTED +#─── notes: ────────────────────────────────────────────────────────────────────────────── +# +# COMFY_URL, as you may expect, should point to the URL you use to access ComfyUI. If you +# don't know, watch for it in the server logs once ComfyUI is fully launched. +# +# COMFYUI_DIR, with similar self-evidence, should point to the base directory of your +# ComfyUI installation (i.e. the folder that contains `models`, `inputs`, and `outputs`) +# +# PhotoPrism integration is not yet implemented, so don't bother with that just yet. +# ────────── +# +# D O N ' T M I S S O N E ─────────────────────────────────────── +#\ F I N A L S M A T T E R I N G O F M I S C E L L A N Y \ +# \ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\ +# \ _ _ _/\\\\_ _ _ _ _ _ /\\\\ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\ +# \ _ _ \/\\\\\\_ _ _ _ /\\\\\\ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\ +# \ _ _ \/\\\//\\\_ _ /\\\//\\\ _ _/\\\ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\ +# \ _ _ \/\\\\///\\\/\\\/ \/\\\ _ _///_ _ _/\\\\\\\\\\_ _ _ _/\\\\\\\\_ _\ +# \ _ _ \/\\\ _\///\\\/ _ \/\\\ _ _/\\\ _ \/\\\////// _ _ _/\\\////// _ _\ +# \ _ _ \/\\\ _ _\/// _ _ \/\\\ _ _/\\\ _ \/\\\\\\\\\\_ _ /\\\_ _ _ _ _ _\ +# \ _ _ \/\\\ _ _ _ _ _ _ \/\\\ _ _/\\\ _ \////////\\\_ _\//\\\ _ _ _ _ _\ +# \ _ _ \/\\\ _ _ _ _ _ _ \/\\\ _ _/\\\ _ _/\\\\\\\\\\_ _ \///\\\\\\\\_ _\ +# \ _ _ \///_ _ _ _ _ _ _ \///_ _ _///_ _ \////////// _ _ _ \//////// _ _\ +# \ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\ +# ─────────────────── A N D O T H E R W H A T - H A V E - Y O U S ── +# +#─── other needful API keys, mainly: ──────────────────────────────────────────────────── +CF_TOKEN=ESjjVFHXfe6NrBo5TrN4_AfhHNezytCVlY-VS2HD +VISUALCROSSING_API_KEY=DAEJSKWJQ2CHM3J6B7C5FWQZV +ELEVENLABS_API_KEY=01eeafb6ce0f6d1fd70e4aa9e7262827 +COURTLISTENER_BASE_URL=https://www.courtlistener.com +COURTLISTENER_API_KEY=your_courtlistener_api_key_here +TIMING_API_URL=https://web.timingapp.com/api/v1 +TIMING_API_KEY=eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJhdWQiOiIxIiwianRpIjoiODMyNTMwYTAxYjJhNzdlOTgzZWRlMjRiNDdkMmY0YWYzYWU3YTIzYjkxM2QyNTFmZjE1YTQ4YTkzYjI3YTQ4M2I0MzE5MzU2MzZlMjYyZWYiLCJpYXQiOjE2OTI0MjEyNjkuMDc4MzU4LCJuYmYiOjE2OTI0MjEyNjkuMDc4MzYsImV4cCI6MjAwODA0MDQ2OS4wNzMzMjcsInN1YiI6IjM2MTgxNzA4NDcyMTEwMzg5NzYiLCJzY29wZXMiOltdfQ.fVhhJtYb6wtHBQj7J9sxTsT3nb6_BLu4ynqNMC-SpJ2exj31wF7dHXfdGF-ZCg_H2TWh8Jsrak7ovwHsMPvkLRPgxkyjkyLgVbnzZICbP0xffrsguTnillXKCbEkwOVo4s7esf829VVagHCkpNwYbfKLJ9FLHIqs0hQMhh_S7jpbzmXUe7t6tUG43IgILBD0IwphJ2BGs5X2fhjW8FkCke85JxbQ4a29cqYtgFJ7tMP97noTFB4e_gxFpHUl-Ou_bwdpBKfarTyxhtwm1DJkQB_MrAX4py8tmFlFFJPd-7WG-LaswiI7bctN3Lux-If5oxAhm29PkS3ooxvJD86YDR0rJ94aGc8IBypnqYyGFW1ks5fsQ057UwC3XK62ezilWdamh7gtcInShSdHr7pPtIxntCe3x039NSVTBIQ54WHNaWrfI0i83Lm61ak7Ss3qotJhwtIv0aUOUKS3DOz7jfL4Z4GHUjXgBmubeC_vuLHUVCutBsiZ4Jv4QxmWKy2sPlp-r2OgJlAPkcULvTu1GvXavRTrXb7PXkEKO4ErdBTvu2RyA6XLR1MKs0V7mRNvBfuiMRvWRuv9NBLh6vpgRTVo5Tthh-ahSQ-Rd6QcmvVNf-rLnxWGY4nOdx6QLcYKPukQourR2o6QzxGcpb9pDc8X0p2SEtDrDijpy6usFxk +MAC_ID=sij-mbp16 +MAC_UN=sij +MAC_PW="Synchr0!" +#─── notes: ────────────────────────────────────────────────────────────────────────────── +# +# +# CF_TOKEN: a Cloudflare token. 
This is used on the cf router for quick
+#                           deployment of new domains in tandem with Caddy and
+#                           for ddns.
+#
+#  VISUALCROSSING_API_KEY:  used for obtaining weather forecasts. It is a very
+#                           data-rich yet affordable source of weather info, with
+#                           a generous free plan.
+#
+#      ELEVENLABS_API_KEY:  used on the tts router when tts tasks are outsourced
+#                           to the state-of-the-art models at ElevenLabs.
+#
+#   COURTLISTENER_API_KEY:  used primarily on the hooks router, but likely
+#                           relevant only to legal professionals who will be
+#                           aware of what it is for.
+#
+#          TIMING_API_URL:  these two are used on the time router for generating
+#                       &   various timekeeping-related tasks, as well as on the
+#          TIMING_API_KEY:  notes router for generating markdown-formatted
+#                           timeslips. They require an active subscription to the
+#                           Timing app (macOS or web), but it's worth noting that
+#                           Timing comes included in the SetApp subscription
+#                           bundle for, last I checked, the same price as
+#                           subscribing to Timing alone. If you have a Mac and
+#                           somehow don't know this already, SetApp is an utterly
+#                           insane value. I pay $15/mo for apps that I would
+#                           otherwise pay ~$100/mo for if subscribing
+#                           individually. I want to say I wasn't paid to say
+#                           this, but with those savings I almost feel like I was.
+#
+#                  MAC_ID:  These last three variables are for a specific use
+#                  MAC_UN:  case where you want certain commands run, or alerts
+#                  MAC_PW:  appearing, on a designated macOS computer. The alerts
+#                           router is designed to deliver OS-level notifications
+#                           to the specified Mac when a webhook gets a hit on
+#                           specified keywords within the payload. Setting MAC_ID
+#                           to the TS_ID of the target Mac allows the script to
+#                           readily know whether it itself is the target (this is
+#                           relevant in a load-balancing context), and how to
+#                           reach the target if not — to wit, ssh using MagicDNS.
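+# ──────────
+#
+#─── a quick sanity check: ─────────────────────────────────────────────────────────
+#
+#   Once you've set GLOBAL_API_KEY above, a rough smoke test from a shell looks
+#   something like the following. /health and /id are listed in PUBLIC_SERVICES,
+#   so they need no key; `/some/protected/endpoint` below is just a hypothetical
+#   stand-in for whichever non-public endpoint you try first.
+#
+#       # public endpoints respond without credentials:
+#       curl http://localhost:4444/health
+#
+#       # everything else wants the key as a bearer token...
+#       curl -H "Authorization: Bearer $GLOBAL_API_KEY" \
+#            http://localhost:4444/some/protected/endpoint
+#
+#       # ...or, equivalently, as an api_key query parameter:
+#       curl "http://localhost:4444/some/protected/endpoint?api_key=$GLOBAL_API_KEY"
+#
+# ──────────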
+``` \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..dbfc9e9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,47 @@ +python-dotenv +setuptools +PyPDF2 +fastapi +pdf2image +pdfminer +pytesseract +python-dateutil +python-docx +hypercorn +starlette +httpx +pydantic +pytz +requests +aiohttp +paramiko +tailscale +pandas +pydub +torch +selenium +webdriver_manager +faster_whisper +filetype +html2text +markdown +ollama +aiofiles +bs4 +imbox +newspaper3k +python-magic +urllib3 +whisper +huggingface_hub +numpy +tqdm +tiktoken +numba +scipy +vectordb +IPython +torchaudio +lxml +lxml_html_clean +pdfminer.six diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..64853f5 --- /dev/null +++ b/setup.py @@ -0,0 +1,61 @@ +from setuptools import setup, find_packages + +setup( + name='sijapi', + version='0.1', + packages=find_packages(), + entry_points={ + 'console_scripts': [ + 'sijapi = sijapi.__main__:main', + ], + }, + install_requires=[ + 'fastapi', + 'python-dotenv', + 'hypercorn', + 'setuptools', + 'PyPDF2', + 'pdf2image', + 'pdfminer', + 'pytesseract', + 'python-dateutil', + 'python-docx', + 'starlette', + 'httpx', + 'pydantic', + 'pytz', + 'requests', + 'aiohttp', + 'paramiko', + 'tailscale', + 'pandas', + 'pydub', + 'torch', + 'selenium', + 'webdriver_manager', + 'faster_whisper', + 'filetype', + 'html2text', + 'markdown', + 'ollama', + 'aiofiles', + 'bs4', + 'pdfminer.six', + 'lxml_html_clean', + 'imbox', + 'newspaper3k', + 'python-magic', + 'urllib3', + 'whisper', + 'huggingface_hub', + 'numpy', + 'tqdm', + 'tiktoken', + 'numba', + 'scipy', + 'vectordb', + 'IPython', + 'torchaudio' + ], +) + diff --git a/sij.asc b/sij.asc new file mode 100644 index 0000000..d7a1822 --- /dev/null +++ b/sij.asc @@ -0,0 +1,92 @@ +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQINBGY+fL4BEADCpz8FAfa6/7i9mEQCYlwwP2k9DlrUzz+u9BL4BmuoTEcGty9M +7EA2ivRxXo371IIMjL/GyAa8I3WHMEhxuRlGldUQaHzo6PicTn+OiLJ/g2vCfStN +jIYog3WC25P7Es1n1hDuOu8rUL93twXZ4NevgYx+G44M7Q+/1AbSXf83kpawlHhg +HcGmH2vt9UulfTGAvN9s2sH2pn89812lpWLSdPARNw09ePZy4RdiEgJ6t+S+wjaE +Ue/H4FcQC1MLrQnkW5soUOduY9HN0iUk/xZqqkRQctl3ds5oInE483vQsL0HKFvs +MB8lBdXTbVzxvpFe+fvT8d6hiZ/YgxIUEl1KZLDd3atqj+UREuG+LABZUKC4nSUP +EXneXUqi4qVCW9827K9/H+IKahe8OE+OrZAsSfLtsp4AznIxgyQbvpUZzCuRASJN +Kt1cjcJBOv5L0HJ8tVykZd23WuKUXiyxTs1MxsDGyjew30IsAg4WNO/iw9vBO/Yu +pfjlZTcgbghdIuNmOrnCyzKWtUxxfDtWwEBBshbTKusOaGhauBaHrRVE7lKlTblM +x1JIzYBziDmFy25J1XvYb3guilk1yy54poLQaEcE54mQYWHKCNS4eQeL5dJR3Nmu +Pt9GXdMyNO3uyog3WYpyYqch+osbBsHFVNUClxMycnyqZzHQeZHPNJBzJwARAQAB +tC5TYW5neWUgSW5jZS1Kb2hhbm5zZW4gKEF0dG9ybmV5KSA8c2lqQHNpai5sYXc+ +iQJXBBMBCABBAhsDBQkHhh8tBQsJCAcCBhUKCQgLAgQWAgMBAh4BAheAFiEEMjqK +LEezdiJLNhO3U1smWu2+W0QFAmY+fPUCGQEACgkQU1smWu2+W0SwBQ/+L5S1fIop +6iQ/6gQENBNCUVgACWP0/ViJzQGo4iF3UZkV5KV8pgk/TenZSXCLxUj6UpSAe25m +vtrGV4NCL2hLn1NPK11Na6IM1ykfh/L67NKeCqmtQYwNLwW0o0fvUpK9fahPxhmv +EFo+lVCabQndgzmLxnUhxH4qkGSejsaSFoJQ6fVl/DExCL4w/R5rStnRMKDtkuF1 +ONfjZpuLrAylx8Ypf/rocQYn5AJcRD5ZL2bGgDZNe85VNBFmD3b2cGSVpm3J6Rg/ +fPfs1lgtpgXWbBDCF8nRY326Utbr3qoeZUXVQjVZ05Q2SpUYFHiDZJ3EFwQikg5n +cIBfcXQZQhTq/OK0eS0vB1li8m1ce9m8iMC+Pxe5toPkxFV5RO1+o5PG1SyOfzfV +F1c0O9JQqdJzRHoTuqLtVhlmRVBU2d6TjWYlZ6TwPShSTLu0Tkm4EeFJS4oag75d +q7LlIIvrWS4n3CqVpC/PEIUtclytkOkvNQaSWHEVkappS3UjkX1BJmaI8zXYh9jh +sV/5FckvwYnky+w6geFOBs34NW0rg9oNw4KNAywYcOPbI/Ev1z57my+MpA5msw+B +ww9sFC+tzQCSJl0FU2Dg2YMnyqfUtGr9HfXdAGuuUVh+cYFmEdwwZqBWl37pNIGL +SxfF1AdrlHCSpJcLVETe80UraMFAI7tyOwe0L1Nhbmd5ZSBJbmNlLUpvaGFubnNl 
+biA8c2FuZ3llaWpAd2VzdGVybmxhdy5vcmc+iQJUBBMBCAA+FiEEMjqKLEezdiJL +NhO3U1smWu2+W0QFAmY+fOgCGwMFCQeGHy0FCwkIBwIGFQoJCAsCBBYCAwECHgEC +F4AACgkQU1smWu2+W0RlnBAArwaFta9NTRdubTqctv1EET1D9OXAE/R5vdSk2jRQ +1CMYmv6KeMm0Rl7+dNFet/vJOEtITF7TZHnt7WBy7n5m+SIoARsaZYEchjZKsE2g +6RvRWqFGYuUYQWTRKsw0b2tT16BaNLKdV/w3ndRQNS6wDJrW1dRnIWxm4z26d3/H +Rt3o8+LUVxdSWGLliKZU00S+FNPVSwWe/X7+CoIE7T5XZL+OIEJ6DfpK2pkHKT6D +FswF3KOLG36vz5eISk4AT+o9AEoFIpX0hce3DMixEYQSgKN230K8RchC59bO81zE +w7Mic4vpn/wKFhicn+0BA1aJzzOd8iEwiA0p5baq4b2xIwCBiO4uv/HXR1SN1Tfk +QozjAGzl8LzrmwGTWOtOSk/7ckPhPR2MGNhMdtJ7rPeHxImJLh+/f4uBmYnQUdw4 +0j3sMpJmrShW5dXJ8YHqVFfqabYD8HkBztdYI0qGJDpQjEbW6V+DvMWQXOZ8c1ul +NN2vZyY25RkypMQLiphImJa+q6eGtBEas40MeAkgQKIBPBBpb6W1km+m6UnOADKB +0/vOWcZMgijyMPp7WvwXbOwmXI27rHsUTvhFDLPI113a9I5bU8j6VyW2s/sst3Xc +OQDzEgR3KvD4dWjczIg6yliIq9eM5hskpsYyfDfWRWrIbR3Tg8XPwnQRB9dPEHIy +rKS0KVNhbmd5ZSBJbmNlLUpvaGFubnNlbiA8c2FuZ3llQHJpc2V1cC5uZXQ+iQJU +BBMBCAA+FiEEMjqKLEezdiJLNhO3U1smWu2+W0QFAmY+fQYCGwMFCQeGHy0FCwkI +BwIGFQoJCAsCBBYCAwECHgECF4AACgkQU1smWu2+W0SKGA//VRGpS7IwOOlHF7OI ++LEMDebLpLB2PswfWrK+sI9YdXXV/CaT0NcCz2HPCoK+coaDkl4cxh10ykVbjO36 +wZc/rvhpzga2wMLpBLNwpTvSlfMwsCQeRQay498bgdR59gf1hYa/dPYKKrBgNxHa +Kc3dMDWU0adpV4zV1s/iFNQQZfmhUah+8TTlB03hahPzn8V7CqQF+jTfSXiWPv/V +eD1W6Sc1juvLTVxTThbM5ewiIhMP2t7KM+M4viOEqce79IcE2HTcpCaEI7Lh/Eld +9VBZZk/gENuPqyQuLbOIOQhC6LYRZkZC9Vv1FDutfWV5ZBPyaTY/n5pGW3lo+Tfa +FLSamQcD6dyiGm/ZyQbPUDt2aWhqRGr7VvvtfyXLazL9T9Y6ASr5UjLakPr5ihUz +B8InRch9ACPbu7QSIGFk9PQgHme2Cd/HMRLIALnkAmrafgDE+14Rlp9qI2nYhWdD +jkZcLalPXQCDBxUfj1q192Nn3wlKsDkDd2RWT7Mc2RJq2FR36KADPMtz2oJPSib4 +eRgI40E9Wv+zqHDDTU2K/bLi3nmBHvKnXWXPyiBPVL+CAoAhkYHHJwNuRQfxlukq +heS4/CMBRB04foTeu2ltl6/sQdAIyBGKbOC6fMyhJFYbi16nWI6j7iw2XQnqyitu +jC8Pz14NfIAQTpKCVcV32Kn2k1+0I1Nhbmd5ZSBJbmNlLUpvaGFubnNlbiA8c2lq +QGVudi5lc3E+iQJUBBMBCAA+FiEEMjqKLEezdiJLNhO3U1smWu2+W0QFAmY+fRIC +GwMFCQeGHy0FCwkIBwIGFQoJCAsCBBYCAwECHgECF4AACgkQU1smWu2+W0Rbxw/+ +OMYnlyXvo146+3M6JGdvW36CWmc9ZcmaU+xJM3FnG91WNo5J8MnHl0Ks9BwjNWtm +VJgFEdi2EVpSLJnYdQyJILCNt8RAclYvbFHYUOIDEEC2yr5ZKt/odwYAXPxaqQ4O +Sj7R2GbLA52O8zGWfARBAnAQycrlBRjItdpzGeWgRST8O/ot/IkU7xsAKW72E2VB +9jlCahp5c01lEideVqzVhk3z6GzVz1NUKsglgEOmTIjld4mMs+4GX/93q0u1erKO +I7Q6RL6lfdc2opGi5jFMXGWhLLgX2SSsBFJRuSQGnTpbx3XWFS5uA+cku7Fh0fC0 +MKr2vsY18Z6OqU0MdQm6ovIVcvhzIdGfnBU9Ct98DMiUhDCmx3o9XneWj1n7kWKM +gT8s8AvE27tidtkZApwIKHdUy6qfyqwRjxE+KdL6Eh48x3TVYep+wfSfPJ1eq9Ne +7WWXKUx6FGNH01hpQdTLbCYqmwMa03och1wwyi+0wc8rHe6k6y2tURtP3mINkDeV +u1QmVaGRDA2r7oDm9UsFeupGsbFBnTkQIfJgnrLRJFfN2FDJPZDcd/VS71AOSL5C +jY+Dr/WHYPWeN8MHXfG4r/P41wsrnAJEAzSvLRQ9GYCLPe825W+uDJx9eMePodFa +BeIBcM633WXpbIXHnRQhPDfTzejCejO6GoPE7PbtBBi5Ag0EZj58vgEQAPUqNOgQ +kAPd/S+nkGAfvnzC5UD6lVWaQTVL/xU2K1J8l11K5Ck4nq7oMKhzEitu0lA+jG7q +JVwXMj9+rnoGlbIYmmxCZYpSit930Mss3HjYU8IAF4nybGwc5+wO77cldk3WJTI0 +EkFgiM4Jk6Gk/tRf1LgMIfJIUgm8MooPLqg2z5Pj+bbwxw42A20enEwtF3ivEETJ +wuJwsp5uCOAfzOGqqBvp19PMTPynUBuwEXCkJfb0CCz+5yhjoi6ZjCVXxjuoe2wN +jFwoYd8odfSuvC6Fh9qqXnjF7HZLxEyN7K1L/y/sWarsN01zbUUI3kZlnTuamDu4 +LdZtl2q3QqDyxmzHIWLTa1qL0s3WooB7JJqBYaNmQjLHadoktZ4vfhl7kjXYsg+i +84oipL83u2cRHplpqnRk9qVwNdW01EObjNafWY6t3942sM4e/yOdQiaXlxivPuHV +VYwme6K53lmGcV3ipMWRpNkme+oKV/TdYTTdlDaLgC8ga5AW6poNoSp5UpNeOs0E +mxIZivpRQSCr3g+jScy0RdX/+tI1gWe+2ZIHFwR+1WsXvLXHyd1wVyH4vDxSf1bE +VRVsXLZDT/xMGDzNzAC76kzoIykrcndFiTbNzB/LjZJuls6fRdN07bTcymWEKYiP +Ia6iGdag6+ueoX4eDzbjCvldKtkfr/EhB7MfABEBAAGJAjwEGAEIACYWIQQyOoos +R7N2Iks2E7dTWyZa7b5bRAUCZj58vgIbDAUJB4YfLQAKCRBTWyZa7b5bRLZdEACk +AaXNVeywC9+X6bdwkKV5Jl6Hv238cGd58TuVbjd+tii1JazbKEqCAr5tTlGtrUZg 
+fyjM0z5sMKDSZ15paX4xDbDs+xdfMxLVdjmFlZgwTrrTSIx3ODxPo/sSeyrzGZrQ
+hlZjOHP1Bvln0OTQwK0yE3Eaip0FhIpJA5FX3yrZfvza3St5leNOXsZgEri68cgf
+mVhS9tBD2I9TpCVwgq5vRnloAMgtQBYr8N9glXBfs2WsPhU96HSSH88osJW+lCkG
+vTtzQBEjnnSQ/ssHBYz4DfpsJe1fbM+9WVow6q2nkUhqg5TfdAt4H0ra2uPXnNz8
+lvQObVHlw7T0w5UTzgBdlCyYplyTG2gcZi+UWzit6YH9DH82j1otcq3+3NlrKwo0
+TSJKZNagiqgJNZ1mhJQTt3JDacFFkBBxLf6trruuyInRU1leo87hzHCxIlMbQPqh
+ogtV+W9FHElVJwoTQi8YF+0AacZPzK8wJmlPLxBeqs+ULJ8H5wZxlEBB1Jj91/W9
+6R8m2IUZCsXNNpYU+f7uB8x0RUS3pU8S7GcwdJmOa16Xc4VdfWugm4TTEtajeSYC
+ek5j/2s/QkAum5slT2Y6Aam0Jj/IhsGHKVEnR6DS01mZqVeeu0giPFUO4ZX5C0n9
+mAmw/ZUGIOj6ls3KMBHv4pqQI7nd00tW8eIMgKGgKQ==
+=PhPl
+-----END PGP PUBLIC KEY BLOCK-----
\ No newline at end of file
diff --git a/sijapi/__init__.py b/sijapi/__init__.py
new file mode 100644
index 0000000..182cf2b
--- /dev/null
+++ b/sijapi/__init__.py
@@ -0,0 +1,241 @@
+import os
+from pathlib import Path
+import ipaddress
+import multiprocessing
+from dotenv import load_dotenv
+from dateutil import tz
+from pydantic import BaseModel
+import traceback
+import logging
+from . import logs
+
+### Logs ###
+HYPERCORN_LOG_LEVEL = None
+LOGGER = logging.getLogger('LOGGER')
+def DEBUG(d): LOGGER.debug(d)
+def INFO(i): LOGGER.info(i)
+def WARN(w): LOGGER.warning(w)
+def ERR(e):
+    LOGGER.error(e)
+    LOGGER.error(traceback.format_exc())
+def CRITICAL(c):
+    LOGGER.critical(c)
+    LOGGER.critical(traceback.format_exc())
+
+# from sijapi.config.config import load_config
+# cfg = load_config()
+
+### Initial initialization
+BASE_DIR = Path(__file__).resolve().parent
+CONFIG_DIR = BASE_DIR / "config"
+ENV_PATH = CONFIG_DIR / ".env"
+load_dotenv(ENV_PATH)
+
+### API essentials
+ROUTERS = os.getenv('ROUTERS', '').split(',')
+PUBLIC_SERVICES = os.getenv('PUBLIC_SERVICES', '').split(',')
+GLOBAL_API_KEY = os.getenv("GLOBAL_API_KEY")
+# HOST_NET and HOST_PORT comprise HOST, which is what the server will bind to
+HOST_NET = os.getenv("HOST_NET", "127.0.0.1")
+HOST_PORT = int(os.getenv("HOST_PORT", 4444))
+HOST = f"{HOST_NET}:{HOST_PORT}"
+LOCAL_HOSTS = [ipaddress.ip_address(localhost.strip()) for localhost in os.getenv('LOCAL_HOSTS', '127.0.0.1').split(',')] + ['localhost']
+SUBNET_BROADCAST = os.getenv("SUBNET_BROADCAST", '10.255.255.255')
+TRUSTED_SUBNETS = [ipaddress.ip_network(subnet.strip()) for subnet in os.getenv('TRUSTED_SUBNETS', '127.0.0.1/32').split(',')]
+MAX_CPU_CORES = min(int(os.getenv("MAX_CPU_CORES", int(multiprocessing.cpu_count()/2))), multiprocessing.cpu_count())
+
+### Directories & general paths
+HOME_DIR = Path.home()
+ROUTER_DIR = BASE_DIR / "routers"
+DATA_DIR = BASE_DIR / "data"
+os.makedirs(DATA_DIR, exist_ok=True)
+ALERTS_DIR = DATA_DIR / "alerts"
+os.makedirs(ALERTS_DIR, exist_ok=True)
+LOGS_DIR = BASE_DIR / "logs"
+os.makedirs(LOGS_DIR, exist_ok=True)
+REQUESTS_DIR = LOGS_DIR / "requests"
+os.makedirs(REQUESTS_DIR, exist_ok=True)
+REQUESTS_LOG_PATH = LOGS_DIR / "requests.log"
+
+
+### Databases
+DB = os.getenv("DB", 'sijdb')
+DB_HOST = os.getenv("DB_HOST", "127.0.0.1")
+DB_PORT = os.getenv("DB_PORT", 5432)
+DB_USER = os.getenv("DB_USER", 'sij')
+DB_PASS = os.getenv("DB_PASS")
+DB_SSH = os.getenv("DB_SSH", "100.64.64.15")
+DB_SSH_USER = os.getenv("DB_SSH_USER")
+DB_SSH_PASS = os.getenv("DB_SSH_PASS")
+DB_URL = f'postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB}'
+
+
+### LOCATE AND WEATHER LOCALIZATIONS
+USER_FULLNAME = os.getenv('USER_FULLNAME')
+USER_BIO = os.getenv('USER_BIO')
+TZ = tz.gettz(os.getenv("TZ", "America/Los_Angeles"))
+HOME_ZIP = os.getenv("HOME_ZIP") 
# unimplemented
+LOCATION_OVERRIDES = DATA_DIR / "loc_overrides.json"
+LOCATIONS_CSV = DATA_DIR / "US.csv"
+# DB = DATA_DIR / "weatherlocate.db" # deprecated
+VISUALCROSSING_BASE_URL = os.getenv("VISUALCROSSING_BASE_URL", "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline")
+VISUALCROSSING_API_KEY = os.getenv("VISUALCROSSING_API_KEY")
+
+
+### Obsidian & notes
+OBSIDIAN_VAULT_DIR = Path(os.getenv("OBSIDIAN_BASE_DIR") or HOME_DIR / "Nextcloud" / "notes")
+OBSIDIAN_JOURNAL_DIR = OBSIDIAN_VAULT_DIR / "journal"
+OBSIDIAN_RESOURCES_DIR = "obsidian/resources"
+OBSIDIAN_BANNER_DIR = f"{OBSIDIAN_RESOURCES_DIR}/banners"
+os.makedirs(Path(OBSIDIAN_VAULT_DIR) / OBSIDIAN_BANNER_DIR, exist_ok=True)
+OBSIDIAN_BANNER_SCENE = os.getenv("OBSIDIAN_BANNER_SCENE", "wallpaper")
+OBSIDIAN_CHROMADB_COLLECTION = os.getenv("OBSIDIAN_CHROMADB_COLLECTION", "obsidian")
+DOC_DIR = DATA_DIR / "docs"
+os.makedirs(DOC_DIR, exist_ok=True)
+
+### DATETIME SCHEMA FOR DAILY NOTE FOLDER HIERARCHY FORMATTING ###
+YEAR_FMT = os.getenv("YEAR_FMT")
+MONTH_FMT = os.getenv("MONTH_FMT")
+DAY_FMT = os.getenv("DAY_FMT")
+DAY_SHORT_FMT = os.getenv("DAY_SHORT_FMT")
+
+### Large language model
+LLM_URL = os.getenv("LLM_URL", "http://localhost:11434")
+LLM_SYS_MSG = os.getenv("SYSTEM_MSG", "You are a helpful AI assistant.")
+SUMMARY_INSTRUCT = os.getenv('SUMMARY_INSTRUCT', "You are an AI assistant that provides accurate summaries of text -- nothing more and nothing less. You must not include ANY extraneous text other than the summary. Do not include comments apart from the summary, do not preface the summary, and do not provide any form of postscript. Do not add paragraph breaks. Do not add any kind of formatting. Your response should begin with, consist of, and end with an accurate plaintext summary.")
+SUMMARY_INSTRUCT_TTS = os.getenv('SUMMARY_INSTRUCT_TTS', "You are an AI assistant that provides email summaries for Sanjay. Your response will undergo Text-To-Speech conversion and be added to Sanjay's private podcast. Providing adequate context (Sanjay did not send this question to you, he will only hear your response) but aiming for conciseness and precision, and bearing in mind the Text-To-Speech conversion (avoiding acronyms and formalities), summarize the following email.")
+DEFAULT_LLM = os.getenv("DEFAULT_LLM", "dolphin-mistral")
+DEFAULT_VISION = os.getenv("DEFAULT_VISION", "llava")
+DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "Luna")
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
+### Stable diffusion
+SD_IMAGE_DIR = DATA_DIR / "sd" / "images"
+os.makedirs(SD_IMAGE_DIR, exist_ok=True)
+SD_WORKFLOWS_DIR = DATA_DIR / "sd" / "workflows"
+os.makedirs(SD_WORKFLOWS_DIR, exist_ok=True)
+COMFYUI_URL = os.getenv('COMFYUI_URL', "http://localhost:8188")
+COMFYUI_DIR = Path(os.getenv('COMFYUI_DIR'))
+COMFYUI_OUTPUT_DIR = COMFYUI_DIR / 'output'
+COMFYUI_LAUNCH_CMD = os.getenv('COMFYUI_LAUNCH_CMD', 'mamba activate comfyui && python main.py')
+
+### Summarization
+SUMMARY_CHUNK_SIZE = int(os.getenv("SUMMARY_CHUNK_SIZE", 4000))  # measured in tokens
+SUMMARY_CHUNK_OVERLAP = int(os.getenv("SUMMARY_CHUNK_OVERLAP", 100))  # measured in tokens
+SUMMARY_TPW = float(os.getenv("SUMMARY_TPW", 1.3))  # estimated tokens per word
+SUMMARY_LENGTH_RATIO = int(os.getenv("SUMMARY_LENGTH_RATIO", 4))  # ratio of original length to summary length
+SUMMARY_MIN_LENGTH = int(os.getenv("SUMMARY_MIN_LENGTH", 150))  # measured in tokens
+SUMMARY_INSTRUCT = os.getenv("SUMMARY_INSTRUCT", "Summarize the provided text. Respond with the summary and nothing else. 
Do not otherwise acknowledge the request. Just provide the requested summary.")
+SUMMARY_MODEL = os.getenv("SUMMARY_MODEL", "llama3")
+SUMMARY_TOKEN_LIMIT = int(os.getenv("SUMMARY_TOKEN_LIMIT", 4096))
+
+### ASR
+ASR_DIR = DATA_DIR / "asr"
+os.makedirs(ASR_DIR, exist_ok=True)
+WHISPER_CPP_DIR = HOME_DIR / str(os.getenv("WHISPER_CPP_DIR"))
+WHISPER_CPP_MODELS = os.getenv('WHISPER_CPP_MODELS', 'NULL,VOID').split(',')
+
+### TTS
+PREFERRED_TTS = os.getenv("PREFERRED_TTS", "None")
+TTS_DIR = DATA_DIR / "tts"
+os.makedirs(TTS_DIR, exist_ok=True)
+VOICE_DIR = TTS_DIR / 'voices'
+os.makedirs(VOICE_DIR, exist_ok=True)
+PODCAST_DIR = TTS_DIR / "sideloads"
+os.makedirs(PODCAST_DIR, exist_ok=True)
+TTS_OUTPUT_DIR = TTS_DIR / 'outputs'
+os.makedirs(TTS_OUTPUT_DIR, exist_ok=True)
+TTS_SEGMENTS_DIR = TTS_DIR / 'segments'
+os.makedirs(TTS_SEGMENTS_DIR, exist_ok=True)
+ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
+
+### Calendar & email account
+MS365_TOGGLE = os.getenv("MS365_TOGGLE") == "True"
+ICAL_TOGGLE = os.getenv("ICAL_TOGGLE") == "True"
+ICS_PATH = DATA_DIR / 'calendar.ics'  # deprecated now, but maybe revive?
+ICALENDARS = os.getenv('ICALENDARS', 'NULL,VOID').split(',')
+class IMAP_DETAILS(BaseModel):
+    email: str
+    password: str
+    host: str
+    imap_port: int
+    smtp_port: int
+    imap_encryption: str = None
+    smtp_encryption: str = None
+
+IMAP = IMAP_DETAILS(
+    email = os.getenv('IMAP_EMAIL'),
+    password = os.getenv('IMAP_PASSWORD'),
+    host = os.getenv('IMAP_HOST', '127.0.0.1'),
+    imap_port = int(os.getenv('IMAP_PORT', 1143)),
+    smtp_port = int(os.getenv('SMTP_PORT', 469)),
+    imap_encryption = os.getenv('IMAP_ENCRYPTION', None),
+    smtp_encryption = os.getenv('SMTP_ENCRYPTION', None)
+)
+AUTORESPONSE_WHITELIST = os.getenv('AUTORESPONSE_WHITELIST', '').split(',')
+AUTORESPONSE_BLACKLIST = os.getenv('AUTORESPONSE_BLACKLIST', '').split(',')
+AUTORESPONSE_BLACKLIST.extend(["no-reply@", "noreply@", "@uscourts.gov", "@doi.gov"])
+AUTORESPONSE_CONTEXT = os.getenv('AUTORESPONSE_CONTEXT', None)
+AUTORESPOND = AUTORESPONSE_CONTEXT is not None
+
+### Courtlistener & other webhooks
+COURTLISTENER_DOCKETS_DIR = DATA_DIR / "courtlistener" / "dockets"
+os.makedirs(COURTLISTENER_DOCKETS_DIR, exist_ok=True)
+COURTLISTENER_SEARCH_DIR = DATA_DIR / "courtlistener" / "cases"
+os.makedirs(COURTLISTENER_SEARCH_DIR, exist_ok=True)
+CASETABLE_PATH = DATA_DIR / "courtlistener" / "cases.json"
+COURTLISTENER_API_KEY = os.getenv("COURTLISTENER_API_KEY")
+COURTLISTENER_BASE_URL = os.getenv("COURTLISTENER_BASE_URL", "https://www.courtlistener.com")
+COURTLISTENER_DOCKETS_URL = "https://www.courtlistener.com/api/rest/v3/dockets/"
+
+### Keys & passwords
+PUBLIC_KEY_FILE = os.getenv("PUBLIC_KEY_FILE", 'your_public_key.asc')
+PUBLIC_KEY = (BASE_DIR.parent / PUBLIC_KEY_FILE).read_text()
+MAC_ID = os.getenv("MAC_ID")
+MAC_UN = os.getenv("MAC_UN")
+MAC_PW = os.getenv("MAC_PW")
+TIMING_API_KEY = os.getenv("TIMING_API_KEY")
+TIMING_API_URL = os.getenv("TIMING_API_URL", "https://web.timingapp.com/api/v1")
+PHOTOPRISM_URL = os.getenv("PHOTOPRISM_URL")
+PHOTOPRISM_USER = os.getenv("PHOTOPRISM_USER")
+PHOTOPRISM_PASS = os.getenv("PHOTOPRISM_PASS")
+
+### Tailscale
+TS_IP = ipaddress.ip_address(os.getenv("TS_IP")) if os.getenv("TS_IP") else None
+TS_SUBNET = ipaddress.ip_network(os.getenv("TS_SUBNET")) if os.getenv("TS_SUBNET") else None
+TS_ID = os.getenv("TS_ID", "NULL")
+TS_TAILNET = os.getenv("TS_TAILNET", "NULL")
+TS_ADDRESS = f"http://{TS_ID}.{TS_TAILNET}.ts.net"
+
+### Cloudflare
+CF_API_BASE_URL = 
os.getenv("CF_API_BASE_URL") +CF_TOKEN = os.getenv("CF_TOKEN") +CF_IP = DATA_DIR / "cf_ip.txt" # to be deprecated soon +CF_DOMAINS_PATH = DATA_DIR / "cf_domains.json" # to be deprecated soon + +### Caddy - not fully implemented +BASE_URL = os.getenv("BASE_URL") +CADDY_SERVER = os.getenv('CADDY_SERVER', None) +CADDYFILE_PATH = os.getenv("CADDYFILE_PATH", "") if CADDY_SERVER is not None else None +CADDY_API_KEY = os.getenv("CADDY_API_KEY") + + +### Microsoft Graph +MS365_CLIENT_ID = os.getenv('MS365_CLIENT_ID') +MS365_SECRET = os.getenv('MS365_SECRET') +MS365_TENANT_ID = os.getenv('MS365_TENANT_ID') +MS365_CERT_PATH = CONFIG_DIR / 'MS365' / '.cert.pem' # deprecated +MS365_KEY_PATH = CONFIG_DIR / 'MS365' / '.cert.key' # deprecated +MS365_KEY = MS365_KEY_PATH.read_text() +MS365_TOKEN_PATH = CONFIG_DIR / 'MS365' / '.token.txt' +MS365_THUMBPRINT = os.getenv('MS365_THUMBPRINT') + +MS365_LOGIN_URL = os.getenv("MS365_LOGIN_URL", "https://login.microsoftonline.com") +MS365_AUTHORITY_URL = f"{MS365_LOGIN_URL}/{MS365_TENANT_ID}" +MS365_REDIRECT_PATH = os.getenv("MS365_REDIRECT_PATH", "https://api.sij.ai/o365/oauth_redirect") +MS365_SCOPE = os.getenv("MS365_SCOPE", 'Calendars.Read,Calendars.ReadWrite,offline_access').split(',') + +### Maintenance +GARBAGE_COLLECTION_INTERVAL = 60 * 60 # Run cleanup every hour +GARBAGE_TTL = 60 * 60 * 24 # Delete files older than 24 hours \ No newline at end of file diff --git a/sijapi/__main__.py b/sijapi/__main__.py new file mode 100755 index 0000000..1aa0fb2 --- /dev/null +++ b/sijapi/__main__.py @@ -0,0 +1,133 @@ +#!/Users/sij/miniforge3/envs/api/bin/python +from fastapi import FastAPI, Request, HTTPException, Response +from fastapi.responses import JSONResponse +from fastapi.middleware.cors import CORSMiddleware +from starlette.middleware.base import BaseHTTPMiddleware +from starlette.middleware.base import BaseHTTPMiddleware +from starlette.requests import ClientDisconnect +from hypercorn.asyncio import serve +from hypercorn.config import Config +import sys +import asyncio +import httpx +import argparse +import json +import ipaddress +import importlib +from dotenv import load_dotenv +from pathlib import Path +from datetime import datetime +from . 
import logs
+parser = argparse.ArgumentParser(description='Personal API.')
+parser.add_argument('--debug', action='store_true', help='Set log level to DEBUG')
+parser.add_argument('--test', type=str, help='Load only the specified module.')
+args = parser.parse_args()
+# honor the --debug flag (assumes logs.setup accepts a level name, per the flag's help text)
+logs.setup("debug" if args.debug else "info")
+from sijapi import DEBUG, INFO, WARN, ERR, CRITICAL
+
+from sijapi import HOST, ENV_PATH, GLOBAL_API_KEY, REQUESTS_DIR, ROUTER_DIR, REQUESTS_LOG_PATH, PUBLIC_SERVICES, TRUSTED_SUBNETS, ROUTERS
+
+
+# Initialize a FastAPI application
+api = FastAPI()
+
+
+# CORSMiddleware
+api.add_middleware(
+    CORSMiddleware,
+    allow_origins=['*'],
+    allow_credentials=True,
+    allow_methods=['*'],
+    allow_headers=['*'],
+)
+
+class SimpleAPIKeyMiddleware(BaseHTTPMiddleware):
+    async def dispatch(self, request: Request, call_next):
+        client_ip = ipaddress.ip_address(request.client.host)
+        if request.method == "OPTIONS":
+            # Allow CORS preflight requests
+            return JSONResponse(status_code=200)
+        if request.url.path not in PUBLIC_SERVICES:
+            if not any(client_ip in subnet for subnet in TRUSTED_SUBNETS):
+                api_key_header = request.headers.get("Authorization")
+                api_key_query = request.query_params.get("api_key")
+                if api_key_header:
+                    # strip an optional "Bearer " scheme prefix without altering the key's case
+                    api_key_header = api_key_header.split(" ", 1)[-1].strip()
+                if api_key_header != GLOBAL_API_KEY and api_key_query != GLOBAL_API_KEY:
+                    WARN("Invalid API key provided by a requester.")
+                    return JSONResponse(
+                        status_code=401,
+                        content={"detail": "Invalid or missing API key"}
+                    )
+        response = await call_next(request)
+        # DEBUG(f"Request from {client_ip} is complete")
+        return response
+
+api.add_middleware(SimpleAPIKeyMiddleware)
+
+canceled_middleware = """
+@api.middleware("http")
+async def log_requests(request: Request, call_next):
+    DEBUG(f"Incoming request: {request.method} {request.url}")
+    DEBUG(f"Request headers: {request.headers}")
+    DEBUG(f"Request body: {await request.body()}")
+    response = await call_next(request)
+    return response
+
+async def log_outgoing_request(request):
+    INFO(f"Outgoing request: {request.method} {request.url}")
+    DEBUG(f"Request headers: {request.headers}")
+    DEBUG(f"Request body: {request.content}")
+"""
+
+@api.exception_handler(HTTPException)
+async def http_exception_handler(request: Request, exc: HTTPException):
+    ERR(f"HTTP Exception: {exc.status_code} - {exc.detail}")
+    ERR(f"Request: {request.method} {request.url}")
+    return JSONResponse(status_code=exc.status_code, content={"detail": exc.detail})
+
+@api.middleware("http")
+async def handle_exception_middleware(request: Request, call_next):
+    try:
+        response = await call_next(request)
+    except RuntimeError as exc:
+        if str(exc) == "Response content longer than Content-Length":
+            # Update the Content-Length header to match the actual response content length
+            response.headers["Content-Length"] = str(len(response.body))
+        else:
+            raise
+    return response
+
+
+
+def load_router(router_name):
+    router_file = ROUTER_DIR / f'{router_name}.py'
+    DEBUG(f"Attempting to load {router_name.capitalize()}...")
+    if router_file.exists():
+        module_path = f'sijapi.routers.{router_name}'
+        try:
+            module = importlib.import_module(module_path)
+            router = getattr(module, router_name)
+            api.include_router(router)
+            INFO(f"{router_name.capitalize()} router loaded.")
+        except (ImportError, AttributeError) as e:
+            CRITICAL(f"Failed to load router {router_name}: {e}")
+    else:
+        WARN(f"Router file for {router_name} does not exist.")
+
+def main(argv=None):
+    if args.test:
+        load_router(args.test)
+    else:
+        CRITICAL(f"sijapi 
launched") + CRITICAL(f"{args._get_args}") + for router_name in ROUTERS: + load_router(router_name) + + config = Config() + config.keep_alive_timeout = 1200 + config.bind = [HOST] + asyncio.run(serve(api, config)) + +if __name__ == "__main__": + main(sys.argv[1:]) \ No newline at end of file diff --git a/sijapi/config/config.py b/sijapi/config/config.py new file mode 100644 index 0000000..6fc0b72 --- /dev/null +++ b/sijapi/config/config.py @@ -0,0 +1,98 @@ +import os +import yaml +from time import sleep +from pathlib import Path +import ipaddress + +import yaml + +class Config: + def __init__(self, yaml_file): + with open(yaml_file, 'r') as file: + self.data = yaml.safe_load(file) + + def __getattr__(self, name): + if name in self.data: + value = self.data[name] + if isinstance(value, dict): + return ConfigSection(value) + return value + raise AttributeError(f"Config has no attribute '{name}'") + +class ConfigSection: + def __init__(self, data): + self.data = data + + def __getattr__(self, name): + if name in self.data: + value = self.data[name] + if isinstance(value, dict): + return ConfigSection(value) + return value + raise AttributeError(f"ConfigSection has no attribute '{name}'") + + def __setattr__(self, name, value): + if name == 'data': + super().__setattr__(name, value) + else: + self.data[name] = value + +# Load the YAML configuration file +CFG = Config('.config.yaml') + +# Access existing attributes +print(CFG.API.PORT) # Output: localhost + +def load_config(): + yaml_file = os.path.join(os.path.dirname(__file__), ".config.yaml") + + HOME_DIR = Path.home() + BASE_DIR = Path(__file__).resolve().parent.parent + CONFIG_DIR = BASE_DIR / "config" + ROUTER_DIR = BASE_DIR / "routers" + + DATA_DIR = BASE_DIR / "data" + os.makedirs(DATA_DIR, exist_ok=True) + + ALERTS_DIR = DATA_DIR / "alerts" + os.makedirs(ALERTS_DIR, exist_ok=True) + + LOGS_DIR = BASE_DIR / "logs" + os.makedirs(LOGS_DIR, exist_ok=True) + REQUESTS_DIR = LOGS_DIR / "requests" + os.makedirs(REQUESTS_DIR, exist_ok=True) + REQUESTS_LOG_PATH = LOGS_DIR / "requests.log" + DOC_DIR = DATA_DIR / "docs" + os.makedirs(DOC_DIR, exist_ok=True) + SD_IMAGE_DIR = DATA_DIR / "sd" / "images" + os.makedirs(SD_IMAGE_DIR, exist_ok=True) + SD_WORKFLOWS_DIR = DATA_DIR / "sd" / "workflows" + + + + try: + with open(yaml_file, 'r') as file: + config_data = yaml.safe_load(file) + + vars = { + + + "API": { + + } + } + + + config = Config(config_data) + return config + except Exception as e: + print(f"Error while loading configuration: {e}") + return None + +def reload_config(): + while True: + global config + with open('config.yaml', 'r') as file: + config_data = yaml.safe_load(file) + config = Config(config_data) + sleep(300) # reload every 5 minutes \ No newline at end of file diff --git a/sijapi/config/llms.json-example b/sijapi/config/llms.json-example new file mode 100644 index 0000000..c75165d --- /dev/null +++ b/sijapi/config/llms.json-example @@ -0,0 +1,151 @@ +{ + "Alpaca": { + "models": [ + "mythomax", + "openhermes", + "deepseek" + ], + "prefix": "\n### Instruction:\n", + "stops": [ + "### Instruction" + ], + "suffix": "\n### Response:\n", + "sysPrefix": "### System\n", + "sysSuffix": "\n" + }, + "Amazon": { + "models": [ + "mistrallite" + ], + "prefix": "<|prompter|>", + "stops": [ + "<|prompter|>", + "" + ], + "suffix": "<|assistant|>", + "sysPrefix": "", + "sysSuffix": "" + }, + "ChatML": { + "models": [ + "dolphin", + "capybara", + "nous-hermes-2" + ], + "prefix": "<|im_end|>\n<|im_start|>user\n", + "stops": [ + "<|im_end|>", + 
"<|im_start|>" + ], + "suffix": "<|im_end|>\n<|im_start|>assistant\n", + "sysPrefix": "<|im_start|>system\n", + "sysSuffix": "<|im_end|>" + }, + "Llama2": { + "models": [ + "llama2-placeholder" + ], + "prefix": "\n\n[INST] ", + "stops": [ + "[/INST]", + "[INST]" + ], + "suffix": "[/INST]\n\n", + "sysPrefix": "", + "sysSuffix": "\n\n" + }, + "Mistral": { + "models": [ + "mistral-instruct", + "mixtral-8x7b-instruct" + ], + "prefix": "\n[INST] ", + "stops": [ + "[/INST]", + "[INST]", + "" + ], + "suffix": "[/INST]\n", + "sysPrefix": "", + "sysSuffix": "\n" + }, + "Orca": { + "models": [ + "upstage", + "neural", + "solar", + "SOLAR" + ], + "prefix": "\n### User:\n", + "stops": [ + "###", + "User:" + ], + "suffix": "\n### Assistant:\n", + "sysPrefix": "### System:\n", + "sysSuffix": "\n" + }, + "Phi2": { + "models": [ + "phi-2" + ], + "prefix": "\nSangye: ", + "stops": [ + "###", + "User Message" + ], + "suffix": "\nAssistant: ", + "sysPrefix": "Systen: ", + "sysSuffix": "\n" + }, + "Phind": { + "models": [ + "phind" + ], + "prefix": "\n### User Message\n", + "stops": [ + "###", + "User Message" + ], + "suffix": "\n### Assistant\n", + "sysPrefix": "### System Prompt\n", + "sysSuffix": "\n" + }, + "Vicuna": { + "models": [ + "xwin", + "synthia", + "tess" + ], + "prefix": "\nUSER: ", + "stops": [ + "", + "USER:", + "SYSTEM:" + ], + "suffix": "\nASSISTANT: ", + "sysPrefix": "SYSTEM: ", + "sysSuffix": "\n" + }, + "Zephyr": { + "models": [ + "zephyr" + ], + "prefix": " ", + "stops": [ + "" + ], + "suffix": "\n ", + "sysPrefix": " ", + "sysSuffix": "\n" + }, + "default": { + "prefix": "\n### Instruction:\n", + "stops": [ + "### Instruction" + ], + "suffix": "\n### Response:\n", + "sysPrefix": "### System\n", + "sysSuffix": "\n" + } +} diff --git a/sijapi/config/sd-example.json b/sijapi/config/sd-example.json new file mode 100644 index 0000000..a8b6306 --- /dev/null +++ b/sijapi/config/sd-example.json @@ -0,0 +1,43 @@ +{ + "scenes": [ + { + "scene": "default", + "triggers": [""], + "API_PPrompt": "(Highly-detailed) image of ", + "API_SPrompt": "; ((masterpiece)); ((beautiful lighting)), subdued, fine detail, extremely sharp, 8k, insane detail, dynamic lighting, cinematic, best quality, ultra detailed.", + "API_NPrompt": "`oil, paint splash, oil effect, dots, paint, freckles, liquid effect, canvas frame, 3d, bad art, asian, illustrated, deformed, blurry, duplicate, bad art, bad anatomy, worst quality, low quality, watermark, FastNegativeV2, (easynegative:0.5), epiCNegative, easynegative, verybadimagenegative_v1.3, nsfw, explicit, topless`", + "LLM_SysMsg": "You are a helpful AI who assists in refining prompts that will be used to generate highly realistic images. Upon receiving a prompt, you refine it by simplifying and distilling it to its essence, retaining the most visually evocative and distinct elements from what was provided. You may infer some visual details that were not provided in the prompt, so long as they are consistent with the prompt. Always use the most visually descriptive terms possible, and avoid any vague or abstract concepts. Do not include any words or descriptions based on other senses or emotions. Strive to show rather than tell. 
Space is limited, so be efficient with your words.", + "LLM_PrePrompt": "Using the most visually descriptive sentence fragments, phrases, and words, distill this scene description to its essence, staying true to what it describes: ", + "workflows": [{"workflow": "turbo.json", "size": "1024x768"}] + }, + { + "scene": "portrait", + "triggers": [ + "portrait", + "profile", + "headshot" + ], + "API_PPrompt": "Highly-detailed portrait photo of ", + "API_SPrompt": "; attractive, cute, (((masterpiece))); ((beautiful lighting)), subdued, fine detail, extremely sharp, 8k, insane detail, dynamic lighting, cinematic, best quality, ultra detailed.", + "API_NPrompt": "canvas frame, 3d, ((bad art)), illustrated, deformed, blurry, duplicate, bad anatomy, worst quality, low quality, watermark, FastNegativeV2, (easynegative:0.5), epiCNegative, easynegative, verybadimagenegative_v1.3, nsfw, nude", + "LLM_SysMsg": "You are a helpful AI who assists in refining prompts that will be used to generate highly realistic portrait photos. Upon receiving a prompt, you refine it by simplifying and distilling it to its essence, retaining the most visually evocative and distinct elements from what was provided, focusing in particular on the pictured individual's eyes, pose, and other distinctive features. You may infer some visual details that were not provided in the prompt, so long as they are consistent with the rest of the prompt. Always use the most visually descriptive terms possible, and avoid any vague or abstract concepts. Do not include any words or descriptions based on other senses or emotions. Strive to show rather than tell. Space is limited, so be efficient with your words. Remember that the final product will be a still image, and action verbs are not as helpful as simple descriptions of position, appearance, background, etc.", + "LLM_PrePrompt": "Using the most visually descriptive sentence fragments, phrases, and words, distill this portrait photo to its essence: ", + "workflows": [ + { + "workflow": "selfie.json", + "size": "768x1024" + } + ] + }, + { + "scene": "wallpaper", + "triggers": ["wallpaper"], + "API_PPrompt": "Stunning widescreen image of ", + "API_SPrompt": ", masterpiece, (subtle:0.7), (nuanced:0.6), best quality, ultra detailed, ultra high resolution, 8k, (documentary:0.3), cinematic, filmic, moody, dynamic lighting, realistic, wallpaper, landscape photography, professional, earthporn, (eliot porter:0.6), (frans lanting:0.4), (daniel kordan:0.6), landscapephotography, ultra detailed, earth tones, moody", + "API_NPrompt": "FastNegativeV2, (easynegative:0.5), canvas frame, 3d, ((bad art)), illustrated, deformed, blurry, duplicate, Photoshop, video game, anime, cartoon, fake, tiling, out of frame, bad art, bad anatomy, 3d render, nsfw, worst quality, low quality, text, watermark, (Thomas Kinkade:0.5), sentimental, kitsch, kitschy, twee, commercial, holiday card, modern, futuristic, urban, comic, cartoon, FastNegativeV2, epiCNegative, easynegative, verybadimagenegative_v1.3", + "LLM_SysMsg": "You are a helpful AI who assists in generating prompts that will be used to generate highly realistic images. Always use the most visually descriptive terms possible, and avoid any vague or abstract concepts. Do not include any words or descriptions based on other senses or emotions. Strive to show rather than tell. 
Space is limited, so be efficient with your words.",
+      "LLM_PrePrompt": "Using a series of words or sentence fragments separated by commas, describe a professional landscape photograph of a striking scene of nature. You can select any place on Earth that a young model from the Pacific Northwest is likely to travel to. Focus on describing the content and composition of the image. Only use words and phrases that are visually descriptive. This model is especially fond of wild and rugged places, mountains. She favors dark muted earth tones, dramatic lighting, and interesting juxtapositions between foreground and background, or center of frame and outer frame areas. Avoid cliche situations; instead strive for nuance and originality in composition and environment.",
+      "workflows": [{"workflow": "landscape.json", "size": "1160x768"}]
+    }
+  ]
+}
diff --git a/sijapi/helpers/calendar/exportCal.scpt b/sijapi/helpers/calendar/exportCal.scpt
new file mode 100644
index 0000000..8937955
Binary files /dev/null and b/sijapi/helpers/calendar/exportCal.scpt differ
diff --git a/sijapi/helpers/calendar/updateCal b/sijapi/helpers/calendar/updateCal
new file mode 100755
index 0000000..42ef363
--- /dev/null
+++ b/sijapi/helpers/calendar/updateCal
@@ -0,0 +1,2 @@
+#!/bin/bash
+osascript /Users/sij/workshop/sijapi/helpers/updateCal.scpt
diff --git a/sijapi/helpers/calendar/updateCal.scpt b/sijapi/helpers/calendar/updateCal.scpt
new file mode 100755
index 0000000..401b465
Binary files /dev/null and b/sijapi/helpers/calendar/updateCal.scpt differ
diff --git a/sijapi/helpers/calendar/updateCal2.scpt b/sijapi/helpers/calendar/updateCal2.scpt
new file mode 100644
index 0000000..6531a6b
Binary files /dev/null and b/sijapi/helpers/calendar/updateCal2.scpt differ
diff --git a/sijapi/helpers/courtlistener/clHooks.py b/sijapi/helpers/courtlistener/clHooks.py
new file mode 100644
index 0000000..683f8aa
--- /dev/null
+++ b/sijapi/helpers/courtlistener/clHooks.py
@@ -0,0 +1,196 @@
+from fastapi import FastAPI, Request, BackgroundTasks, HTTPException, status
+from fastapi.responses import JSONResponse
+import httpx
+import json
+import logging
+from pathlib import Path
+import asyncio
+from datetime import datetime
+import os, io
+from PyPDF2 import PdfReader
+import aiohttp
+
+hook = FastAPI()
+
+
+# /Users/sij/Library/CloudStorage/OneDrive-WELC/Documents - WELC-Docket
+SYNC_FOLDER = Path(__file__).resolve().parent.parent
+HOME_FOLDER = Path.home()
+DOCKETS_FOLDER = HOME_FOLDER / "Dockets"
+SEARCH_FOLDER = HOME_FOLDER / "Watched Cases"
+SCRIPTS_FOLDER = SYNC_FOLDER / ".scripts"
+REQUESTS_FOLDER = HOME_FOLDER / "sync" / "requests"
+COURTLISTENER_BASE_URL = "https://www.courtlistener.com"
+COURTLISTENER_DOCKETS_URL = "https://www.courtlistener.com/api/rest/v3/dockets/"
+COURTLISTENER_API_KEY = "efb5fe00f3c6c88d65a32541260945befdf53a7e"
+
+with open(SCRIPTS_FOLDER / 'caseTable.json', 'r') as file:
+    CASE_TABLE = json.load(file)
+
+@hook.get("/health")
+async def health():
+    return {"status": "ok"}
+
+@hook.post("/cl/docket")
+async def respond(request: Request, background_tasks: BackgroundTasks):
+    client_ip = request.client.host
+    logging.info(f"Received request from IP: {client_ip}")
+    data = await request.json()
+    payload = data['payload']
+    results = data['payload']['results']
+    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+    payload_file = REQUESTS_FOLDER / f"{timestamp}-{client_ip}_docket.json"
+    with open(payload_file, 'w') as file:
+        json.dump(payload, file, indent=2)
+
+    for result in results:
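+        # Note: one background task is queued per result; the endpoint returns
+        # its 200 response immediately while process_docket (below) fetches and
+        # saves the underlying documents asynchronously.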
background_tasks.add_task(process_docket, result) + return JSONResponse(content={"message": "Received"}, status_code=status.HTTP_200_OK) + +async def process_docket(result): + async with httpx.AsyncClient() as session: + await process_docket_result(result, session) + + +async def process_docket_result(result, session): + docket = str(result.get('docket')) + case_code, case_shortname = get_case_details(docket) + date_filed = result.get('date_filed', 'No Date Filed') + + try: + date_filed_formatted = datetime.strptime(date_filed, '%Y-%m-%d').strftime('%Y%m%d') + except ValueError: + date_filed_formatted = 'NoDateFiled' + + # Fetching court docket information from the API + url = f"{COURTLISTENER_DOCKETS_URL}?id={docket}" + headers = {'Authorization': f'Token {COURTLISTENER_API_KEY}'} + async with aiohttp.ClientSession() as session: + async with session.get(url, headers=headers) as response: + if response.status == 200: + logging.info(f"Fetching CourtListener docket information for {docket}...") + data = await response.json() + court_docket = data['results'][0]['docket_number_core'] + court_docket = f"{court_docket[:2]}-cv-{court_docket[2:]}" # Formatting the docket number + case_name = data['results'][0]['case_name'] + logging.info(f"Obtained from CourtListener: docket {court_docket}, case name {case_name}.") + else: + logging.info("Failed to fetch data from CourtListener API.") + court_docket = 'NoCourtDocket' + case_name = 'NoCaseName' + + for document in result.get('recap_documents', []): + filepath_ia = document.get('filepath_ia') + filepath_local = document.get('filepath_local') + + if filepath_ia: + file_url = filepath_ia + logging.info(f"Found IA file at {file_url}.") + elif filepath_local: + file_url = f"{COURTLISTENER_BASE_URL}/{filepath_local}" + logging.info(f"Found local file at {file_url}.") + else: + logging.info(f"No file URL found in filepath_ia or filepath_local for one of the documents.") + continue + + document_number = document.get('document_number', 'NoDocumentNumber') + description = document.get('description', 'NoDescription').replace(" ", "_").replace("/", "_") + description = description[:50] # Truncate description + # case_shortname = case_name # TEMPORARY OVERRIDE + file_name = f"{case_code}_{document_number}_{date_filed_formatted}_{description}.pdf" + target_path = Path(DOCKETS_FOLDER) / case_shortname / "Docket" / file_name + target_path.parent.mkdir(parents=True, exist_ok=True) + await download_file(file_url, target_path, session) + logging.info(f"Downloaded {file_name} to {target_path}") + + +def get_case_details(docket): + case_info = CASE_TABLE.get(str(docket), {"code": "000", "shortname": "UNKNOWN"}) + case_code = case_info.get("code") + short_name = case_info.get("shortname") + return case_code, short_name + + + +async def download_file(url: str, path: Path, session: aiohttp.ClientSession = None): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36' + } + async with aiohttp.ClientSession() as session: + logging.info(f"Attempting to download {url} to {path}.") + try: + async with session.get(url, headers=headers, allow_redirects=True) as response: + if response.status == 403: + logging.error(f"Access denied (403 Forbidden) for URL: {url}. 
Skipping download.") + return + response.raise_for_status() + + # Check if the response content type is a PDF + content_type = response.headers.get('Content-Type') + if content_type != 'application/pdf': + logging.error(f"Invalid content type: {content_type}. Skipping download.") + return + + # Create an in-memory buffer to store the downloaded content + buffer = io.BytesIO() + async for chunk in response.content.iter_chunked(1024): + buffer.write(chunk) + + # Reset the buffer position to the beginning + buffer.seek(0) + + # Validate the downloaded PDF content + try: + PdfReader(buffer) + except Exception as e: + logging.error(f"Invalid PDF content: {str(e)}. Skipping download.") + return + + # If the PDF is valid, write the content to the file on disk + path.parent.mkdir(parents=True, exist_ok=True) + with path.open('wb') as file: + file.write(buffer.getvalue()) + + except Exception as e: + logging.error(f"Error downloading file: {str(e)}") + +@hook.post("/cl/search") +async def respond_search(request: Request, background_tasks: BackgroundTasks): + client_ip = request.client.host + logging.info(f"Received request from IP: {client_ip}") + data = await request.json() + payload = data['payload'] + results = data['payload']['results'] + + # Save the payload data + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + payload_file = REQUESTS_FOLDER / f"{timestamp}-{client_ip}_search.json" + with open(payload_file, 'w') as file: + json.dump(payload, file, indent=2) + + for result in results: + background_tasks.add_task(process_search_result, result) + return JSONResponse(content={"message": "Received"}, status_code=status.HTTP_200_OK) + + +async def process_search_result(result): + async with httpx.AsyncClient() as session: + download_url = result.get('download_url') + court_id = result.get('court_id') + case_name_short = result.get('caseNameShort') + case_name = result.get('caseName') + logging.info(f"Received payload for case {case_name} ({court_id}) and download url {download_url}") + + court_folder = court_id + + if case_name_short: + case_folder = case_name_short + else: + case_folder = case_name + + file_name = download_url.split('/')[-1] + target_path = Path(SEARCH_FOLDER) / court_folder / case_folder / file_name + target_path.parent.mkdir(parents=True, exist_ok=True) + + await download_file(download_url, target_path, session) + logging.info(f"Downloaded {file_name} to {target_path}") \ No newline at end of file diff --git a/sijapi/helpers/courtlistener/subscribeAlerts.py b/sijapi/helpers/courtlistener/subscribeAlerts.py new file mode 100644 index 0000000..62a2017 --- /dev/null +++ b/sijapi/helpers/courtlistener/subscribeAlerts.py @@ -0,0 +1,32 @@ +import json +import requests + +# Load the caseTable.json file +with open('caseTable.json', 'r') as file: + case_table = json.load(file) + +# Set the base URL and authorization token +base_url = "https://www.courtlistener.com/api/rest/v3/docket-alerts/" +auth_token = "a90d3f2de489aa4138a32133ca8bfec9d85fecfa" + +# Iterate through each key (docket ID) in the case table +for docket_id in case_table.keys(): + # Set the data payload and headers for the request + data = {'docket': docket_id} + headers = {'Authorization': f'Token {auth_token}'} + + try: + # Send the POST request to the CourtListener API + response = requests.post(base_url, data=data, headers=headers) + + # Check the response status code + if response.status_code == 200: + print(f"Successfully created docket alert for docket ID: {docket_id}") + else: + print(f"Failed to create 
docket alert for docket ID: {docket_id}") + print(f"Status code: {response.status_code}") + print(f"Response content: {response.content}") + + except requests.exceptions.RequestException as e: + print(f"Error occurred while creating docket alert for docket ID: {docket_id}") + print(f"Error message: {str(e)}") diff --git a/sijapi/helpers/database/dbrestore.sh b/sijapi/helpers/database/dbrestore.sh new file mode 100755 index 0000000..273410e --- /dev/null +++ b/sijapi/helpers/database/dbrestore.sh @@ -0,0 +1,146 @@ +#!/bin/bash + +DB_NAME="weatherlocate.db" + +# Step 1: Backup existing data +echo "Backing up existing data..." +sqlite3 $DB_NAME < ... + +# Update complication on device from which this function was installed with a number of content parameters that can be string, progress, icon, target or color. + +# Each argument type is derived from input. + +# Progress has the form: 50% or 110/220 + +# Icon must match valid SF Symbol name such as globe or terminal.fill + +# Colors must be hex colours such as #000 #ff00ff where the color is used for later content and 'foreground' switches back to default colour + +# Target is used to send different content to different complications after configuring the complications with different target identifiers which requires the pro unlock. The target parameter is never assumed unless --target is used and is effective until next --target parameter allowing updates of several complications with a single command + +# You can configure complications to only show content for a given target. + +# String is the fallback type if nothing else matches, but content type can be forced for next parameter with --progress, --icon, --color, --text or --target with +# something like: + +widget --text "50/100" + +# You can update several complications at once by using --target to send all parameters until the next --target to a particular complication. Updating several complications at once allows more total updates per day. 
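+# For example (illustrative only; these target identifiers are hypothetical),
+# a single invocation can update two complications at once:
+#
+#   widget --target deploys --color #ff8800 --progress 7/10 --target uptime --icon checkmark.circle "online"
+#
+# Everything after the first --target is sent to the "deploys" complication
+# until the second --target switches output to "uptime".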
+
+# EOF
+# return 0
+# fi
+
+# local key=d7e810e7601cd296a05776c169b4fe97a6a5ee1fd46abe38de54f415732b3f4b
+# local user=WuqPwm1VpGijF4U5AnIKzqNMVWGioANTRjJoonPm
+# local iv=ab5bbeb426015da7eedcee8bee3dffb7
+
+# local plain=$(
+#   echo Secure ShellFish Widget 2.0
+#   for var in "$@"
+#   do
+#     echo -ne "$var" | base64
+#   done)
+# local base64=$(echo "$plain" | openssl enc -aes-256-cbc -base64 -K $key -iv $iv)
+# curl -sS -X POST -H "Content-Type: text/plain" --data "$base64" "https://secureshellfish.app/push/?user=$user"
diff --git a/sijapi/helpers/obsidian/month_o_banners.sh b/sijapi/helpers/obsidian/month_o_banners.sh
new file mode 100755
index 0000000..e06385d
--- /dev/null
+++ b/sijapi/helpers/obsidian/month_o_banners.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Iterate from 01 to 30
+for i in $(seq -w 01 30); do
+  # Construct the date string
+  DATE="2024-06-${i}"
+
+  # Print the date being processed (optional)
+  echo "Processing date: $DATE"
+
+  # Run the curl command
+  curl -X POST -H "Content-Type: application/json" -d '{"mood": "joyful"}' "http://localhost:4444/note/banner?dt=$DATE"
+
+  # Wait for the curl command to finish before starting the next iteration
+  wait
+done
+
diff --git a/sijapi/helpers/scrapers/Readability.js b/sijapi/helpers/scrapers/Readability.js
new file mode 100644
index 0000000..89fdbd0
--- /dev/null
+++ b/sijapi/helpers/scrapers/Readability.js
@@ -0,0 +1,2373 @@
+/*
+ * Copyright (c) 2010 Arc90 Inc
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This code is heavily based on Arc90's readability.js (1.7.1) script
+ * available at: http://code.google.com/p/arc90labs-readability
+ */
+
+/**
+ * Public constructor.
+ * @param {HTMLDocument} doc     The document to parse.
+ * @param {Object}       options The options object.
+ */
+function Readability(doc, options) {
+  // In some older versions, people passed a URI as the first argument. 
Cope: + if (options && options.documentElement) { + doc = options; + options = arguments[2]; + } else if (!doc || !doc.documentElement) { + throw new Error("First argument to Readability constructor should be a document object."); + } + options = options || {}; + + this._doc = doc; + this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__; + this._articleTitle = null; + this._articleByline = null; + this._articleDir = null; + this._articleSiteName = null; + this._attempts = []; + + // Configurable options + this._debug = !!options.debug; + this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; + this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; + this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD; + this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []); + this._keepClasses = !!options.keepClasses; + this._serializer = options.serializer || function(el) { + return el.innerHTML; + }; + this._disableJSONLD = !!options.disableJSONLD; + this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos; + + // Start with all flags set + this._flags = this.FLAG_STRIP_UNLIKELYS | + this.FLAG_WEIGHT_CLASSES | + this.FLAG_CLEAN_CONDITIONALLY; + + + // Control whether log messages are sent to the console + if (this._debug) { + let logNode = function(node) { + if (node.nodeType == node.TEXT_NODE) { + return `${node.nodeName} ("${node.textContent}")`; + } + let attrPairs = Array.from(node.attributes || [], function(attr) { + return `${attr.name}="${attr.value}"`; + }).join(" "); + return `<${node.localName} ${attrPairs}>`; + }; + this.log = function () { + if (typeof console !== "undefined") { + let args = Array.from(arguments, arg => { + if (arg && arg.nodeType == this.ELEMENT_NODE) { + return logNode(arg); + } + return arg; + }); + args.unshift("Reader: (Readability)"); + console.log.apply(console, args); + } else if (typeof dump !== "undefined") { + /* global dump */ + var msg = Array.prototype.map.call(arguments, function(x) { + return (x && x.nodeName) ? logNode(x) : x; + }).join(" "); + dump("Reader: (Readability) " + msg + "\n"); + } + }; + } else { + this.log = function () {}; + } +} + +Readability.prototype = { + FLAG_STRIP_UNLIKELYS: 0x1, + FLAG_WEIGHT_CLASSES: 0x2, + FLAG_CLEAN_CONDITIONALLY: 0x4, + + // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType + ELEMENT_NODE: 1, + TEXT_NODE: 3, + + // Max number of nodes supported by this parser. Default: 0 (no limit) + DEFAULT_MAX_ELEMS_TO_PARSE: 0, + + // The number of top candidates to consider when analysing how + // tight the competition is among candidates. + DEFAULT_N_TOP_CANDIDATES: 5, + + // Element tags to score by default. + DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","), + + // The default number of chars an article must have in order to return a result + DEFAULT_CHAR_THRESHOLD: 500, + + // All of the regular expressions in use within readability. + // Defined up here so we don't instantiate them repeatedly in loops. + REGEXPS: { + // NOTE: These two regular expressions are duplicated in + // Readability-readerable.js. Please keep both copies in sync. 
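+    // Illustrative examples (not exhaustive): class="sidebar social-share"
+    // matches unlikelyCandidates and would be stripped, while
+    // class="main-content sidebar" also matches okMaybeItsACandidate and is
+    // therefore kept. See the stripUnlikelyCandidates logic in _grabArticle.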
+ unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, + okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, + + positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, + negative: /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, + extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, + byline: /byline|author|dateline|writtenby|p-author/i, + replaceFonts: /<(\/?)font[^>]*>/gi, + normalize: /\s{2,}/g, + videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i, + shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i, + nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, + prevLink: /(prev|earl|old|new|<|«)/i, + tokenize: /\W+/g, + whitespace: /^\s*$/, + hasContent: /\S$/, + hashUrl: /^#.+/, + srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g, + b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i, + // Commas as used in Latin, Sindhi, Chinese and various other scripts. + // see: https://en.wikipedia.org/wiki/Comma#Comma_variants + commas: /\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C/g, + // See: https://schema.org/Article + jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/, + // used to see if a node's content matches words commonly used for ad blocks or loading indicators + adWords: /^(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)$/iu, + loadingWords: /^((loading|正在加载|Загрузка|chargement|cargando)(…|\.\.\.)?)$/iu, + }, + + UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ], + + DIV_TO_P_ELEMS: new Set([ "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL" ]), + + ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"], + + PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ], + + DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ], + + // The commented out elements qualify as phrasing content but tend to be + // removed by readability when put into paragraphs, so we ignore them here. + PHRASING_ELEMS: [ + // "CANVAS", "IFRAME", "SVG", "VIDEO", + "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", + "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL", + "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q", + "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB", + "SUP", "TEXTAREA", "TIME", "VAR", "WBR", + ], + + // These are the classes that readability sets itself. + CLASSES_TO_PRESERVE: [ "page" ], + + // These are the list of HTML entities that need to be escaped. 
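+  // e.g. _unescapeHtmlEntities("Ben &amp; Jerry&#x27;s") yields "Ben & Jerry's";
+  // the map below covers the named entities, while numeric references are
+  // decoded separately in _unescapeHtmlEntities.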
+ HTML_ESCAPE_MAP: { + "lt": "<", + "gt": ">", + "amp": "&", + "quot": '"', + "apos": "'", + }, + + /** + * Run any post-process modifications to article content as necessary. + * + * @param Element + * @return void + **/ + _postProcessContent: function(articleContent) { + // Readability cannot open relative uris so we convert them to absolute uris. + this._fixRelativeUris(articleContent); + + this._simplifyNestedElements(articleContent); + + if (!this._keepClasses) { + // Remove classes. + this._cleanClasses(articleContent); + } + }, + + /** + * Iterates over a NodeList, calls `filterFn` for each node and removes node + * if function returned `true`. + * + * If function is not passed, removes all the nodes in node list. + * + * @param NodeList nodeList The nodes to operate on + * @param Function filterFn the function to use as a filter + * @return void + */ + _removeNodes: function(nodeList, filterFn) { + // Avoid ever operating on live node lists. + if (this._docJSDOMParser && nodeList._isLiveNodeList) { + throw new Error("Do not pass live node lists to _removeNodes"); + } + for (var i = nodeList.length - 1; i >= 0; i--) { + var node = nodeList[i]; + var parentNode = node.parentNode; + if (parentNode) { + if (!filterFn || filterFn.call(this, node, i, nodeList)) { + parentNode.removeChild(node); + } + } + } + }, + + /** + * Iterates over a NodeList, and calls _setNodeTag for each node. + * + * @param NodeList nodeList The nodes to operate on + * @param String newTagName the new tag name to use + * @return void + */ + _replaceNodeTags: function(nodeList, newTagName) { + // Avoid ever operating on live node lists. + if (this._docJSDOMParser && nodeList._isLiveNodeList) { + throw new Error("Do not pass live node lists to _replaceNodeTags"); + } + for (const node of nodeList) { + this._setNodeTag(node, newTagName); + } + }, + + /** + * Iterate over a NodeList, which doesn't natively fully implement the Array + * interface. + * + * For convenience, the current object context is applied to the provided + * iterate function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The iterate function. + * @return void + */ + _forEachNode: function(nodeList, fn) { + Array.prototype.forEach.call(nodeList, fn, this); + }, + + /** + * Iterate over a NodeList, and return the first node that passes + * the supplied test function + * + * For convenience, the current object context is applied to the provided + * test function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The test function. + * @return void + */ + _findNode: function(nodeList, fn) { + return Array.prototype.find.call(nodeList, fn, this); + }, + + /** + * Iterate over a NodeList, return true if any of the provided iterate + * function calls returns true, false otherwise. + * + * For convenience, the current object context is applied to the + * provided iterate function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The iterate function. + * @return Boolean + */ + _someNode: function(nodeList, fn) { + return Array.prototype.some.call(nodeList, fn, this); + }, + + /** + * Iterate over a NodeList, return true if all of the provided iterate + * function calls return true, false otherwise. + * + * For convenience, the current object context is applied to the + * provided iterate function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The iterate function. 
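+   * e.g. this._everyNode(cell.childNodes, this._isPhrasingContent) is used in
+   * _prepArticle below to test whether a table cell holds only phrasing content.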
+ * @return Boolean + */ + _everyNode: function(nodeList, fn) { + return Array.prototype.every.call(nodeList, fn, this); + }, + + /** + * Concat all nodelists passed as arguments. + * + * @return ...NodeList + * @return Array + */ + _concatNodeLists: function() { + var slice = Array.prototype.slice; + var args = slice.call(arguments); + var nodeLists = args.map(function(list) { + return slice.call(list); + }); + return Array.prototype.concat.apply([], nodeLists); + }, + + _getAllNodesWithTag: function(node, tagNames) { + if (node.querySelectorAll) { + return node.querySelectorAll(tagNames.join(",")); + } + return [].concat.apply([], tagNames.map(function(tag) { + var collection = node.getElementsByTagName(tag); + return Array.isArray(collection) ? collection : Array.from(collection); + })); + }, + + /** + * Removes the class="" attribute from every element in the given + * subtree, except those that match CLASSES_TO_PRESERVE and + * the classesToPreserve array from the options object. + * + * @param Element + * @return void + */ + _cleanClasses: function(node) { + var classesToPreserve = this._classesToPreserve; + var className = (node.getAttribute("class") || "") + .split(/\s+/) + .filter(function(cls) { + return classesToPreserve.indexOf(cls) != -1; + }) + .join(" "); + + if (className) { + node.setAttribute("class", className); + } else { + node.removeAttribute("class"); + } + + for (node = node.firstElementChild; node; node = node.nextElementSibling) { + this._cleanClasses(node); + } + }, + + /** + * Converts each and uri in the given element to an absolute URI, + * ignoring #ref URIs. + * + * @param Element + * @return void + */ + _fixRelativeUris: function(articleContent) { + var baseURI = this._doc.baseURI; + var documentURI = this._doc.documentURI; + function toAbsoluteURI(uri) { + // Leave hash links alone if the base URI matches the document URI: + if (baseURI == documentURI && uri.charAt(0) == "#") { + return uri; + } + + // Otherwise, resolve against base URI: + try { + return new URL(uri, baseURI).href; + } catch (ex) { + // Something went wrong, just return the original: + } + return uri; + } + + var links = this._getAllNodesWithTag(articleContent, ["a"]); + this._forEachNode(links, function(link) { + var href = link.getAttribute("href"); + if (href) { + // Remove links with javascript: URIs, since + // they won't work after scripts have been removed from the page. 
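+          // e.g. <a href="javascript:void(0)">Share</a> is replaced by the
+          // bare text node "Share".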
+ if (href.indexOf("javascript:") === 0) { + // if the link only contains simple text content, it can be converted to a text node + if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) { + var text = this._doc.createTextNode(link.textContent); + link.parentNode.replaceChild(text, link); + } else { + // if the link has multiple children, they should all be preserved + var container = this._doc.createElement("span"); + while (link.firstChild) { + container.appendChild(link.firstChild); + } + link.parentNode.replaceChild(container, link); + } + } else { + link.setAttribute("href", toAbsoluteURI(href)); + } + } + }); + + var medias = this._getAllNodesWithTag(articleContent, [ + "img", "picture", "figure", "video", "audio", "source", + ]); + + this._forEachNode(medias, function(media) { + var src = media.getAttribute("src"); + var poster = media.getAttribute("poster"); + var srcset = media.getAttribute("srcset"); + + if (src) { + media.setAttribute("src", toAbsoluteURI(src)); + } + + if (poster) { + media.setAttribute("poster", toAbsoluteURI(poster)); + } + + if (srcset) { + var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) { + return toAbsoluteURI(p1) + (p2 || "") + p3; + }); + + media.setAttribute("srcset", newSrcset); + } + }); + }, + + _simplifyNestedElements: function(articleContent) { + var node = articleContent; + + while (node) { + if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) { + if (this._isElementWithoutContent(node)) { + node = this._removeAndGetNext(node); + continue; + } else if (this._hasSingleTagInsideElement(node, "DIV") || this._hasSingleTagInsideElement(node, "SECTION")) { + var child = node.children[0]; + for (var i = 0; i < node.attributes.length; i++) { + child.setAttribute(node.attributes[i].name, node.attributes[i].value); + } + node.parentNode.replaceChild(child, node); + node = child; + continue; + } + } + + node = this._getNextNode(node); + } + }, + + /** + * Get the article title as an H1. + * + * @return string + **/ + _getArticleTitle: function() { + var doc = this._doc; + var curTitle = ""; + var origTitle = ""; + + try { + curTitle = origTitle = doc.title.trim(); + + // If they had an element with id "title" in their HTML + if (typeof curTitle !== "string") + curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]); + } catch (e) {/* ignore exceptions setting the title. */} + + var titleHadHierarchicalSeparators = false; + function wordCount(str) { + return str.split(/\s+/).length; + } + + // If there's a separator in the title, first remove the final part + if ((/ [\|\-\\\/>»] /).test(curTitle)) { + titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle); + curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1"); + + // If the resulting title is too short (3 words or fewer), remove + // the first part instead: + if (wordCount(curTitle) < 3) + curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, "$1"); + } else if (curTitle.indexOf(": ") !== -1) { + // Check if we have an heading containing this exact string, so we + // could assume it's the full title. 
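+      // e.g. for document.title "Site: My Article", an <h1> or <h2> whose text
+      // is exactly "Site: My Article" means the colon is part of the real
+      // title, so it should not be split.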
+ var headings = this._concatNodeLists( + doc.getElementsByTagName("h1"), + doc.getElementsByTagName("h2") + ); + var trimmedTitle = curTitle.trim(); + var match = this._someNode(headings, function(heading) { + return heading.textContent.trim() === trimmedTitle; + }); + + // If we don't, let's extract the title out of the original title string. + if (!match) { + curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1); + + // If the title is now too short, try the first colon instead: + if (wordCount(curTitle) < 3) { + curTitle = origTitle.substring(origTitle.indexOf(":") + 1); + // But if we have too many words before the colon there's something weird + // with the titles and the H tags so let's just use the original title instead + } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) { + curTitle = origTitle; + } + } + } else if (curTitle.length > 150 || curTitle.length < 15) { + var hOnes = doc.getElementsByTagName("h1"); + + if (hOnes.length === 1) + curTitle = this._getInnerText(hOnes[0]); + } + + curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " "); + // If we now have 4 words or fewer as our title, and either no + // 'hierarchical' separators (\, /, > or ») were found in the original + // title or we decreased the number of words by more than 1 word, use + // the original title. + var curTitleWordCount = wordCount(curTitle); + if (curTitleWordCount <= 4 && + (!titleHadHierarchicalSeparators || + curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) { + curTitle = origTitle; + } + + return curTitle; + }, + + /** + * Prepare the HTML document for readability to scrape it. + * This includes things like stripping javascript, CSS, and handling terrible markup. + * + * @return void + **/ + _prepDocument: function() { + var doc = this._doc; + + // Remove all style tags in head + this._removeNodes(this._getAllNodesWithTag(doc, ["style"])); + + if (doc.body) { + this._replaceBrs(doc.body); + } + + this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN"); + }, + + /** + * Finds the next node, starting from the given node, and ignoring + * whitespace in between. If the given node is an element, the same node is + * returned. + */ + _nextNode: function (node) { + var next = node; + while (next + && (next.nodeType != this.ELEMENT_NODE) + && this.REGEXPS.whitespace.test(next.textContent)) { + next = next.nextSibling; + } + return next; + }, + + /** + * Replaces 2 or more successive
<br> elements with a single <p></p>.
+   * Whitespace between <br> elements are ignored. For example:
+   *   <div>foo<br>bar<br> <br><br>abc</div>
+   * will become:
+   *   <div>foo<br>bar<p>abc</p></div>
+   */
+  _replaceBrs: function (elem) {
+    this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) {
+      var next = br.nextSibling;
+
+      // Whether 2 or more <br> elements have been found and replaced with a
+      // <p></p> block.
+      var replaced = false;
+
+      // If we find a <br> chain, remove the <br>s until we hit another node
+      // or non-whitespace. This leaves behind the first <br> in the chain
+      // (which will be replaced with a <p></p> later).
+      while ((next = this._nextNode(next)) && (next.tagName == "BR")) {
+        replaced = true;
+        var brSibling = next.nextSibling;
+        next.parentNode.removeChild(next);
+        next = brSibling;
+      }
+
+      // If we removed a <br> chain, replace the remaining <br> with a <p></p>. Add
+      // all sibling nodes as children of the <p></p> until we hit another <br>
+      // chain.
+      if (replaced) {
+        var p = this._doc.createElement("p");
+        br.parentNode.replaceChild(p, br);
+
+        next = p.nextSibling;
+        while (next) {
+          // If we've hit another <br><br>, we're done adding children to this <p></p>.
+          if (next.tagName == "BR") {
+            var nextElem = this._nextNode(next.nextSibling);
+            if (nextElem && nextElem.tagName == "BR")
+              break;
+          }
+
+          if (!this._isPhrasingContent(next))
+            break;
+
+          // Otherwise, make this node a child of the new <p></p>.
+          var sibling = next.nextSibling;
+          p.appendChild(next);
+          next = sibling;
+        }
+
+        while (p.lastChild && this._isWhitespace(p.lastChild)) {
+          p.removeChild(p.lastChild);
+        }
+
+        if (p.parentNode.tagName === "P")
+          this._setNodeTag(p.parentNode, "DIV");
+      }
+    });
+  },
+
+  _setNodeTag: function (node, tag) {
+    this.log("_setNodeTag", node, tag);
+    if (this._docJSDOMParser) {
+      node.localName = tag.toLowerCase();
+      node.tagName = tag.toUpperCase();
+      return node;
+    }
+
+    var replacement = node.ownerDocument.createElement(tag);
+    while (node.firstChild) {
+      replacement.appendChild(node.firstChild);
+    }
+    node.parentNode.replaceChild(replacement, node);
+    if (node.readability)
+      replacement.readability = node.readability;
+
+    for (var i = 0; i < node.attributes.length; i++) {
+      try {
+        replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
+      } catch (ex) {
+        /* it's possible for setAttribute() to throw if the attribute name
+         * isn't a valid XML Name. Such attributes can however be parsed from
+         * source in HTML docs, see https://github.com/whatwg/html/issues/4275,
+         * so we can hit them here and then throw. We don't care about such
+         * attributes so we ignore them.
+         */
+      }
+    }
+    return replacement;
+  },
+
+  /**
+   * Prepare the article node for display. Clean out any inline styles,
+   * iframes, forms, strip extraneous <p> 
tags, etc. + * + * @param Element + * @return void + **/ + _prepArticle: function(articleContent) { + this._cleanStyles(articleContent); + + // Check for data tables before we continue, to avoid removing items in + // those tables, which will often be isolated even though they're + // visually linked to other content-ful elements (text, images, etc.). + this._markDataTables(articleContent); + + this._fixLazyImages(articleContent); + + // Clean out junk from the article content + this._cleanConditionally(articleContent, "form"); + this._cleanConditionally(articleContent, "fieldset"); + this._clean(articleContent, "object"); + this._clean(articleContent, "embed"); + this._clean(articleContent, "footer"); + this._clean(articleContent, "link"); + this._clean(articleContent, "aside"); + + // Clean out elements with little content that have "share" in their id/class combinations from final top candidates, + // which means we don't remove the top candidates even they have "share". + + var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD; + + this._forEachNode(articleContent.children, function (topCandidate) { + this._cleanMatchedNodes(topCandidate, function (node, matchString) { + return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold; + }); + }); + + this._clean(articleContent, "iframe"); + this._clean(articleContent, "input"); + this._clean(articleContent, "textarea"); + this._clean(articleContent, "select"); + this._clean(articleContent, "button"); + this._cleanHeaders(articleContent); + + // Do these last as the previous stuff may have removed junk + // that will affect these + this._cleanConditionally(articleContent, "table"); + this._cleanConditionally(articleContent, "ul"); + this._cleanConditionally(articleContent, "div"); + + // replace H1 with H2 as H1 should be only title that is displayed separately + this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h1"]), "h2"); + + // Remove extra paragraphs + this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) { + var imgCount = paragraph.getElementsByTagName("img").length; + var embedCount = paragraph.getElementsByTagName("embed").length; + var objectCount = paragraph.getElementsByTagName("object").length; + // At this point, nasty iframes have been removed, only remain embedded video ones. + var iframeCount = paragraph.getElementsByTagName("iframe").length; + var totalCount = imgCount + embedCount + objectCount + iframeCount; + + return totalCount === 0 && !this._getInnerText(paragraph, false); + }); + + this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) { + var next = this._nextNode(br.nextSibling); + if (next && next.tagName == "P") + br.parentNode.removeChild(br); + }); + + // Remove single-cell tables + this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) { + var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table; + if (this._hasSingleTagInsideElement(tbody, "TR")) { + var row = tbody.firstElementChild; + if (this._hasSingleTagInsideElement(row, "TD")) { + var cell = row.firstElementChild; + cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV"); + table.parentNode.replaceChild(cell, table); + } + } + }); + }, + + /** + * Initialize a node with the readability object. Also checks the + * className/id for special names to add to its score. 
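+   * e.g. a <div class="article-body"> starts at +5 for the DIV tag, plus the
+   * class-weight bonus from _getClassWeight, since its class matches the
+   * positive regex defined above.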
+ * + * @param Element + * @return void + **/ + _initializeNode: function(node) { + node.readability = {"contentScore": 0}; + + switch (node.tagName) { + case "DIV": + node.readability.contentScore += 5; + break; + + case "PRE": + case "TD": + case "BLOCKQUOTE": + node.readability.contentScore += 3; + break; + + case "ADDRESS": + case "OL": + case "UL": + case "DL": + case "DD": + case "DT": + case "LI": + case "FORM": + node.readability.contentScore -= 3; + break; + + case "H1": + case "H2": + case "H3": + case "H4": + case "H5": + case "H6": + case "TH": + node.readability.contentScore -= 5; + break; + } + + node.readability.contentScore += this._getClassWeight(node); + }, + + _removeAndGetNext: function(node) { + var nextNode = this._getNextNode(node, true); + node.parentNode.removeChild(node); + return nextNode; + }, + + /** + * Traverse the DOM from node to node, starting at the node passed in. + * Pass true for the second parameter to indicate this node itself + * (and its kids) are going away, and we want the next node over. + * + * Calling this in a loop will traverse the DOM depth-first. + */ + _getNextNode: function(node, ignoreSelfAndKids) { + // First check for kids if those aren't being ignored + if (!ignoreSelfAndKids && node.firstElementChild) { + return node.firstElementChild; + } + // Then for siblings... + if (node.nextElementSibling) { + return node.nextElementSibling; + } + // And finally, move up the parent chain *and* find a sibling + // (because this is depth-first traversal, we will have already + // seen the parent nodes themselves). + do { + node = node.parentNode; + } while (node && !node.nextElementSibling); + return node && node.nextElementSibling; + }, + + // compares second text to first one + // 1 = same text, 0 = completely different text + // works the way that it splits both texts into words and then finds words that are unique in second text + // the result is given by the lower length of unique parts + _textSimilarity: function(textA, textB) { + var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); + var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); + if (!tokensA.length || !tokensB.length) { + return 0; + } + var uniqTokensB = tokensB.filter(token => !tokensA.includes(token)); + var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length; + return 1 - distanceB; + }, + + _checkByline: function(node, matchString) { + if (this._articleByline) { + return false; + } + + if (node.getAttribute !== undefined) { + var rel = node.getAttribute("rel"); + var itemprop = node.getAttribute("itemprop"); + } + + if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) { + this._articleByline = node.textContent.trim(); + return true; + } + + return false; + }, + + _getNodeAncestors: function(node, maxDepth) { + maxDepth = maxDepth || 0; + var i = 0, ancestors = []; + while (node.parentNode) { + ancestors.push(node.parentNode); + if (maxDepth && ++i === maxDepth) + break; + node = node.parentNode; + } + return ancestors; + }, + + /*** + * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is + * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. + * + * @param page a document to run upon. Needs to be a full document, complete with body. 
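+   * Note that the loop below can run more than once: if a pass yields fewer
+   * than _charThreshold characters, a flag is relaxed and the page re-parsed.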
+ * @return Element + **/ + _grabArticle: function (page) { + this.log("**** grabArticle ****"); + var doc = this._doc; + var isPaging = page !== null; + page = page ? page : this._doc.body; + + // We can't grab an article if we don't have a page! + if (!page) { + this.log("No body found in document. Abort."); + return null; + } + + var pageCacheHtml = page.innerHTML; + + while (true) { + this.log("Starting grabArticle loop"); + var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); + + // First, node prepping. Trash nodes that look cruddy (like ones with the + // class name "comment", etc), and turn divs into P tags where they have been + // used inappropriately (as in, where they contain no other block level elements.) + var elementsToScore = []; + var node = this._doc.documentElement; + + let shouldRemoveTitleHeader = true; + + while (node) { + + if (node.tagName === "HTML") { + this._articleLang = node.getAttribute("lang"); + } + + var matchString = node.className + " " + node.id; + + if (!this._isProbablyVisible(node)) { + this.log("Removing hidden node - " + matchString); + node = this._removeAndGetNext(node); + continue; + } + + // User is not able to see elements applied with both "aria-modal = true" and "role = dialog" + if (node.getAttribute("aria-modal") == "true" && node.getAttribute("role") == "dialog") { + node = this._removeAndGetNext(node); + continue; + } + + // Check to see if this node is a byline, and remove it if it is. + if (this._checkByline(node, matchString)) { + node = this._removeAndGetNext(node); + continue; + } + + if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) { + this.log("Removing header: ", node.textContent.trim(), this._articleTitle.trim()); + shouldRemoveTitleHeader = false; + node = this._removeAndGetNext(node); + continue; + } + + // Remove unlikely candidates + if (stripUnlikelyCandidates) { + if (this.REGEXPS.unlikelyCandidates.test(matchString) && + !this.REGEXPS.okMaybeItsACandidate.test(matchString) && + !this._hasAncestorTag(node, "table") && + !this._hasAncestorTag(node, "code") && + node.tagName !== "BODY" && + node.tagName !== "A") { + this.log("Removing unlikely candidate - " + matchString); + node = this._removeAndGetNext(node); + continue; + } + + if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) { + this.log("Removing content with role " + node.getAttribute("role") + " - " + matchString); + node = this._removeAndGetNext(node); + continue; + } + } + + // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). + if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" || + node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" || + node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") && + this._isElementWithoutContent(node)) { + node = this._removeAndGetNext(node); + continue; + } + + if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) { + elementsToScore.push(node); + } + + // Turn all divs that don't have children block level elements into p's + if (node.tagName === "DIV") { + // Put phrasing content into paragraphs. 
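+          // e.g. in <div>intro text<a href="#">more</a><div>block</div></div>,
+          // "intro text" and the link are wrapped in a new <p>; the inner
+          // <div> ends the run and stays where it is.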
+ var p = null; + var childNode = node.firstChild; + while (childNode) { + var nextSibling = childNode.nextSibling; + if (this._isPhrasingContent(childNode)) { + if (p !== null) { + p.appendChild(childNode); + } else if (!this._isWhitespace(childNode)) { + p = doc.createElement("p"); + node.replaceChild(p, childNode); + p.appendChild(childNode); + } + } else if (p !== null) { + while (p.lastChild && this._isWhitespace(p.lastChild)) { + p.removeChild(p.lastChild); + } + p = null; + } + childNode = nextSibling; + } + + // Sites like http://mobile.slate.com encloses each paragraph with a DIV + // element. DIVs with only a P element inside and no text content can be + // safely converted into plain P elements to avoid confusing the scoring + // algorithm with DIVs with are, in practice, paragraphs. + if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) { + var newNode = node.children[0]; + node.parentNode.replaceChild(newNode, node); + node = newNode; + elementsToScore.push(node); + } else if (!this._hasChildBlockElement(node)) { + node = this._setNodeTag(node, "P"); + elementsToScore.push(node); + } + } + node = this._getNextNode(node); + } + + /** + * Loop through all paragraphs, and assign a score to them based on how content-y they look. + * Then add their score to their parent node. + * + * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. + **/ + var candidates = []; + this._forEachNode(elementsToScore, function(elementToScore) { + if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined") + return; + + // If this paragraph is less than 25 characters, don't even count it. + var innerText = this._getInnerText(elementToScore); + if (innerText.length < 25) + return; + + // Exclude nodes with no ancestor. + var ancestors = this._getNodeAncestors(elementToScore, 5); + if (ancestors.length === 0) + return; + + var contentScore = 0; + + // Add a point for the paragraph itself as a base. + contentScore += 1; + + // Add points for any commas within this paragraph. + contentScore += innerText.split(this.REGEXPS.commas).length; + + // For every 100 characters in this paragraph, add another point. Up to 3 points. + contentScore += Math.min(Math.floor(innerText.length / 100), 3); + + // Initialize and score ancestors. + this._forEachNode(ancestors, function(ancestor, level) { + if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined") + return; + + if (typeof(ancestor.readability) === "undefined") { + this._initializeNode(ancestor); + candidates.push(ancestor); + } + + // Node score divider: + // - parent: 1 (no division) + // - grandparent: 2 + // - great grandparent+: ancestor level * 3 + if (level === 0) + var scoreDivider = 1; + else if (level === 1) + scoreDivider = 2; + else + scoreDivider = level * 3; + ancestor.readability.contentScore += contentScore / scoreDivider; + }); + }); + + // After we've calculated scores, loop through all of the possible + // candidate nodes we found and find the one with the highest score. + var topCandidates = []; + for (var c = 0, cl = candidates.length; c < cl; c += 1) { + var candidate = candidates[c]; + + // Scale the final candidates score based on link density. Good content + // should have a relatively small link density (5% or less) and be mostly + // unaffected by this operation. 
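+        // e.g. a candidate scoring 100 with 30% of its text inside links drops
+        // to 100 * (1 - 0.3) = 70.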
+ var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); + candidate.readability.contentScore = candidateScore; + + this.log("Candidate:", candidate, "with score " + candidateScore); + + for (var t = 0; t < this._nbTopCandidates; t++) { + var aTopCandidate = topCandidates[t]; + + if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) { + topCandidates.splice(t, 0, candidate); + if (topCandidates.length > this._nbTopCandidates) + topCandidates.pop(); + break; + } + } + } + + var topCandidate = topCandidates[0] || null; + var neededToCreateTopCandidate = false; + var parentOfTopCandidate; + + // If we still have no top candidate, just use the body as a last resort. + // We also have to copy the body node so it is something we can modify. + if (topCandidate === null || topCandidate.tagName === "BODY") { + // Move all of the page's children into topCandidate + topCandidate = doc.createElement("DIV"); + neededToCreateTopCandidate = true; + // Move everything (not just elements, also text nodes etc.) into the container + // so we even include text directly in the body: + while (page.firstChild) { + this.log("Moving child out:", page.firstChild); + topCandidate.appendChild(page.firstChild); + } + + page.appendChild(topCandidate); + + this._initializeNode(topCandidate); + } else if (topCandidate) { + // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array + // and whose scores are quite closed with current `topCandidate` node. + var alternativeCandidateAncestors = []; + for (var i = 1; i < topCandidates.length; i++) { + if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) { + alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i])); + } + } + var MINIMUM_TOPCANDIDATES = 3; + if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) { + parentOfTopCandidate = topCandidate.parentNode; + while (parentOfTopCandidate.tagName !== "BODY") { + var listsContainingThisAncestor = 0; + for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) { + listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate)); + } + if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) { + topCandidate = parentOfTopCandidate; + break; + } + parentOfTopCandidate = parentOfTopCandidate.parentNode; + } + } + if (!topCandidate.readability) { + this._initializeNode(topCandidate); + } + + // Because of our bonus system, parents of candidates might have scores + // themselves. They get half of the node. There won't be nodes with higher + // scores than our topCandidate, but if we see the score going *up* in the first + // few steps up the tree, that's a decent sign that there might be more content + // lurking in other places that we want to unify in. The sibling stuff + // below does some of that - but only if we've looked high enough up the DOM + // tree. + parentOfTopCandidate = topCandidate.parentNode; + var lastScore = topCandidate.readability.contentScore; + // The scores shouldn't get too low. 
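+      // e.g. with a top candidate score of 90, stop climbing once a parent
+      // scores below 30.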
+ var scoreThreshold = lastScore / 3; + while (parentOfTopCandidate.tagName !== "BODY") { + if (!parentOfTopCandidate.readability) { + parentOfTopCandidate = parentOfTopCandidate.parentNode; + continue; + } + var parentScore = parentOfTopCandidate.readability.contentScore; + if (parentScore < scoreThreshold) + break; + if (parentScore > lastScore) { + // Alright! We found a better parent to use. + topCandidate = parentOfTopCandidate; + break; + } + lastScore = parentOfTopCandidate.readability.contentScore; + parentOfTopCandidate = parentOfTopCandidate.parentNode; + } + + // If the top candidate is the only child, use parent instead. This will help sibling + // joining logic when adjacent content is actually located in parent's sibling node. + parentOfTopCandidate = topCandidate.parentNode; + while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) { + topCandidate = parentOfTopCandidate; + parentOfTopCandidate = topCandidate.parentNode; + } + if (!topCandidate.readability) { + this._initializeNode(topCandidate); + } + } + + // Now that we have the top candidate, look through its siblings for content + // that might also be related. Things like preambles, content split by ads + // that we removed, etc. + var articleContent = doc.createElement("DIV"); + if (isPaging) + articleContent.id = "readability-content"; + + var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); + // Keep potential top candidate's parent node to try to get text direction of it later. + parentOfTopCandidate = topCandidate.parentNode; + var siblings = parentOfTopCandidate.children; + + for (var s = 0, sl = siblings.length; s < sl; s++) { + var sibling = siblings[s]; + var append = false; + + this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ""); + this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown"); + + if (sibling === topCandidate) { + append = true; + } else { + var contentBonus = 0; + + // Give a bonus if sibling nodes and top candidates have the example same classname + if (sibling.className === topCandidate.className && topCandidate.className !== "") + contentBonus += topCandidate.readability.contentScore * 0.2; + + if (sibling.readability && + ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) { + append = true; + } else if (sibling.nodeName === "P") { + var linkDensity = this._getLinkDensity(sibling); + var nodeContent = this._getInnerText(sibling); + var nodeLength = nodeContent.length; + + if (nodeLength > 80 && linkDensity < 0.25) { + append = true; + } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 && + nodeContent.search(/\.( |$)/) !== -1) { + append = true; + } + } + } + + if (append) { + this.log("Appending node:", sibling); + + if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) { + // We have a node that isn't a common block level element, like a form or td tag. + // Turn it into a div so it doesn't get filtered out later by accident. + this.log("Altering sibling:", sibling, "to div."); + + sibling = this._setNodeTag(sibling, "DIV"); + } + + articleContent.appendChild(sibling); + // Fetch children again to make it compatible + // with DOM parsers without live collection support. + siblings = parentOfTopCandidate.children; + // siblings is a reference to the children array, and + // sibling is removed from the array when we call appendChild(). 
+ // As a result, we must revisit this index since the nodes + // have been shifted. + s -= 1; + sl -= 1; + } + } + + if (this._debug) + this.log("Article content pre-prep: " + articleContent.innerHTML); + // So we have all of the content that we need. Now we clean it up for presentation. + this._prepArticle(articleContent); + if (this._debug) + this.log("Article content post-prep: " + articleContent.innerHTML); + + if (neededToCreateTopCandidate) { + // We already created a fake div thing, and there wouldn't have been any siblings left + // for the previous loop, so there's no point trying to create a new div, and then + // move all the children over. Just assign IDs and class names here. No need to append + // because that already happened anyway. + topCandidate.id = "readability-page-1"; + topCandidate.className = "page"; + } else { + var div = doc.createElement("DIV"); + div.id = "readability-page-1"; + div.className = "page"; + while (articleContent.firstChild) { + div.appendChild(articleContent.firstChild); + } + articleContent.appendChild(div); + } + + if (this._debug) + this.log("Article content after paging: " + articleContent.innerHTML); + + var parseSuccessful = true; + + // Now that we've gone through the full algorithm, check to see if + // we got any meaningful content. If we didn't, we may need to re-run + // grabArticle with different flags set. This gives us a higher likelihood of + // finding the content, and the sieve approach gives us a higher likelihood of + // finding the -right- content. + var textLength = this._getInnerText(articleContent, true).length; + if (textLength < this._charThreshold) { + parseSuccessful = false; + page.innerHTML = pageCacheHtml; + + if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { + this._removeFlag(this.FLAG_STRIP_UNLIKELYS); + this._attempts.push({articleContent: articleContent, textLength: textLength}); + } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { + this._removeFlag(this.FLAG_WEIGHT_CLASSES); + this._attempts.push({articleContent: articleContent, textLength: textLength}); + } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { + this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); + this._attempts.push({articleContent: articleContent, textLength: textLength}); + } else { + this._attempts.push({articleContent: articleContent, textLength: textLength}); + // No luck after removing flags, just return the longest text we found during the different loops + this._attempts.sort(function (a, b) { + return b.textLength - a.textLength; + }); + + // But first check if we actually have something + if (!this._attempts[0].textLength) { + return null; + } + + articleContent = this._attempts[0].articleContent; + parseSuccessful = true; + } + } + + if (parseSuccessful) { + // Find out text direction from ancestors of final top candidate. + var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate)); + this._someNode(ancestors, function(ancestor) { + if (!ancestor.tagName) + return false; + var articleDir = ancestor.getAttribute("dir"); + if (articleDir) { + this._articleDir = articleDir; + return true; + } + return false; + }); + return articleContent; + } + } + }, + + /** + * Check whether the input string could be a byline. + * This verifies that the input is a string, and that the length + * is less than 100 chars. + * + * @param possibleByline {string} - a string to check whether its a byline. + * @return Boolean - whether the input string is a byline. 
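+   * e.g. "By Jane Doe" is accepted; an empty string, or a paragraph-length
+   * string of 100+ characters, is rejected.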
+  /**
+   * Check whether the input string could be a byline.
+   * This verifies that the input is a string, and that its trimmed
+   * length is between 1 and 100 chars.
+   *
+   * @param possibleByline {string} - a string to check whether it's a byline.
+   * @return Boolean - whether the input string is a byline.
+   */
+  _isValidByline: function(byline) {
+    if (typeof byline == "string" || byline instanceof String) {
+      byline = byline.trim();
+      return (byline.length > 0) && (byline.length < 100);
+    }
+    return false;
+  },
+
+  /**
+   * Converts some of the common HTML entities in a string to their corresponding characters.
+   *
+   * @param str {string} - a string to unescape.
+   * @return string without HTML entities.
+   */
+  _unescapeHtmlEntities: function(str) {
+    if (!str) {
+      return str;
+    }
+
+    var htmlEscapeMap = this.HTML_ESCAPE_MAP;
+    return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) {
+      return htmlEscapeMap[tag];
+    }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) {
+      var num = parseInt(hex || numStr, hex ? 16 : 10);
+      return String.fromCharCode(num);
+    });
+  },
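A quick usage sketch of the unescaping above, as a standalone copy for illustration only; the inline map stands in for the `HTML_ESCAPE_MAP` constant this file defines elsewhere:

```
var htmlEscapeMap = { quot: '"', amp: "&", apos: "'", lt: "<", gt: ">" };

function unescapeHtmlEntities(str) {
  if (!str) return str;
  // First pass: the five named entities; second pass: numeric entities,
  // hex (&#x27;) or decimal (&#169;).
  return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) {
    return htmlEscapeMap[tag];
  }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) {
    return String.fromCharCode(parseInt(hex || numStr, hex ? 16 : 10));
  });
}

console.log(unescapeHtmlEntities("Ben &amp; Jerry&#x27;s &#169; &quot;2024&quot;"));
// -> Ben & Jerry's © "2024"
```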
+  /**
+   * Try to extract metadata from a JSON-LD object.
+   * For now, only Schema.org objects of type Article or its subtypes are supported.
+   * @return Object with any metadata that could be extracted (possibly none)
+   */
+  _getJSONLD: function (doc) {
+    var scripts = this._getAllNodesWithTag(doc, ["script"]);
+
+    var metadata;
+
+    this._forEachNode(scripts, function(jsonLdElement) {
+      if (!metadata && jsonLdElement.getAttribute("type") === "application/ld+json") {
+        try {
+          // Strip CDATA markers if present
+          var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[|\]\]>\s*$/g, "");
+          var parsed = JSON.parse(content);
+          if (
+            !parsed["@context"] ||
+            !parsed["@context"].match(/^https?\:\/\/schema\.org\/?$/)
+          ) {
+            return;
+          }
+
+          if (!parsed["@type"] && Array.isArray(parsed["@graph"])) {
+            // Pass `this` as find's thisArg so REGEXPS is reachable in the callback.
+            parsed = parsed["@graph"].find(function(it) {
+              return (it["@type"] || "").match(
+                this.REGEXPS.jsonLdArticleTypes
+              );
+            }, this);
+          }
+
+          if (
+            !parsed ||
+            !parsed["@type"] ||
+            !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes)
+          ) {
+            return;
+          }
+
+          metadata = {};
+
+          if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) {
+            // We have both name and headline elements in the JSON-LD. They should both be the same, but some websites, like aktualne.cz,
+            // put their own name into "name" and the article title into "headline", which confuses Readability. So we check whether either
+            // "name" or "headline" closely matches the html title, and if so, use that one. If not, we use "name" by default.
+
+            var title = this._getArticleTitle();
+            var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
+            var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75;
+
+            if (headlineMatches && !nameMatches) {
+              metadata.title = parsed.headline;
+            } else {
+              metadata.title = parsed.name;
+            }
+          } else if (typeof parsed.name === "string") {
+            metadata.title = parsed.name.trim();
+          } else if (typeof parsed.headline === "string") {
+            metadata.title = parsed.headline.trim();
+          }
+          if (parsed.author) {
+            if (typeof parsed.author.name === "string") {
+              metadata.byline = parsed.author.name.trim();
+            } else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") {
+              metadata.byline = parsed.author
+                .filter(function(author) {
+                  return author && typeof author.name === "string";
+                })
+                .map(function(author) {
+                  return author.name.trim();
+                })
+                .join(", ");
+            }
+          }
+          if (typeof parsed.description === "string") {
+            metadata.excerpt = parsed.description.trim();
+          }
+          if (
+            parsed.publisher &&
+            typeof parsed.publisher.name === "string"
+          ) {
+            metadata.siteName = parsed.publisher.name.trim();
+          }
+          if (typeof parsed.datePublished === "string") {
+            metadata.datePublished = parsed.datePublished.trim();
+          }
+          return;
+        } catch (err) {
+          this.log(err.message);
+        }
+      }
+    });
+    return metadata ? metadata : {};
+  },
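For instance, given a page carrying a JSON-LD block like the following (values invented for illustration), the parser above accepts it because `@context` points at schema.org and `NewsArticle` is an Article subtype:

```
<script type="application/ld+json">
{
  "@context": "https://schema.org",
  "@type": "NewsArticle",
  "headline": "Example Headline",
  "author": { "@type": "Person", "name": "Jane Doe" },
  "description": "A one-sentence summary.",
  "publisher": { "@type": "Organization", "name": "Example News" },
  "datePublished": "2024-06-23"
}
</script>
```

The returned object would then be `{ title: "Example Headline", byline: "Jane Doe", excerpt: "A one-sentence summary.", siteName: "Example News", datePublished: "2024-06-23" }`.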
+  /**
+   * Attempts to get excerpt and byline metadata for the article.
+   *
+   * @param {Object} jsonld - object containing any metadata that
+   * could be extracted from the JSON-LD object.
+   *
+   * @return Object with optional "title", "byline", "excerpt",
+   * "siteName", and "publishedTime" properties
+   */
+  _getArticleMetadata: function(jsonld) {
+    var metadata = {};
+    var values = {};
+    var metaElements = this._doc.getElementsByTagName("meta");
+
+    // property is a space-separated list of values
+    var propertyPattern = /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi;
+
+    // name is a single value
+    var namePattern = /^\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i;
+
+    // Find description tags.
+    this._forEachNode(metaElements, function(element) {
+      var elementName = element.getAttribute("name");
+      var elementProperty = element.getAttribute("property");
+      var content = element.getAttribute("content");
+      if (!content) {
+        return;
+      }
+      var matches = null;
+      var name = null;
+
+      if (elementProperty) {
+        matches = elementProperty.match(propertyPattern);
+        if (matches) {
+          // Convert to lowercase, and remove any whitespace
+          // so we can match below.
+          name = matches[0].toLowerCase().replace(/\s/g, "");
+          // multiple authors
+          values[name] = content.trim();
+        }
+      }
+      if (!matches && elementName && namePattern.test(elementName)) {
+        name = elementName;
+        if (content) {
+          // Convert to lowercase, remove any whitespace, and convert dots
+          // to colons so we can match below.
+          name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":");
+          values[name] = content.trim();
+        }
+      }
+    });
+
+    // get title
+    metadata.title = jsonld.title ||
+                     values["dc:title"] ||
+                     values["dcterm:title"] ||
+                     values["og:title"] ||
+                     values["weibo:article:title"] ||
+                     values["weibo:webpage:title"] ||
+                     values["title"] ||
+                     values["twitter:title"] ||
+                     values["parsely-title"];
+
+    if (!metadata.title) {
+      metadata.title = this._getArticleTitle();
+    }
+
+    // get author
+    metadata.byline = jsonld.byline ||
+                      values["dc:creator"] ||
+                      values["dcterm:creator"] ||
+                      values["author"] ||
+                      values["parsely-author"];
+
+    // get description
+    metadata.excerpt = jsonld.excerpt ||
+                       values["dc:description"] ||
+                       values["dcterm:description"] ||
+                       values["og:description"] ||
+                       values["weibo:article:description"] ||
+                       values["weibo:webpage:description"] ||
+                       values["description"] ||
+                       values["twitter:description"];
+
+    // get site name
+    metadata.siteName = jsonld.siteName ||
+                        values["og:site_name"];
+
+    // get article published time
+    metadata.publishedTime = jsonld.datePublished ||
+                             values["article:published_time"] ||
+                             values["parsely-pub-date"] ||
+                             null;
+
+    // On many sites the meta values are escaped with HTML entities,
+    // so here we need to unescape them.
+    metadata.title = this._unescapeHtmlEntities(metadata.title);
+    metadata.byline = this._unescapeHtmlEntities(metadata.byline);
+    metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
+    metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
+    metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime);
+
+    return metadata;
+  },
+
+  /**
+   * Check if node is an image, or if node contains exactly one image,
+   * whether as a direct child or as a descendant.
+   *
+   * @param {Element} node
+   **/
+  _isSingleImage: function(node) {
+    if (node.tagName === "IMG") {
+      return true;
+    }
+
+    if (node.children.length !== 1 || node.textContent.trim() !== "") {
+      return false;
+    }
+
+    return this._isSingleImage(node.children[0]);
+  },
+
+  /**
+   * Find all
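Hypothetical markup illustrating the recursive single-image test in `_isSingleImage` above:

```
<div><a><img src="x.jpg"></a></div>             -> true  (one child at each level)
<div><img src="x.jpg"><p>caption</p></div>      -> false (two children)
<div>caption <img src="x.jpg"></div>            -> false (non-empty textContent)
```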