Auto-update: Thu Jun 27 13:16:34 PDT 2024

sanj 2024-06-27 13:16:34 -07:00
parent b81c3c2948
commit 893b0da232
6 changed files with 140 additions and 43 deletions

View file

@@ -12,7 +12,7 @@ from typing import List, Optional
 import traceback
 import logging
 from .logs import Logger
-from .classes import AutoResponder, IMAPConfig, SMTPConfig, EmailAccount, EmailContact, IncomingEmail, TimezoneTracker, Database
+from .classes import AutoResponder, IMAPConfig, SMTPConfig, EmailAccount, EmailContact, IncomingEmail, TimezoneTracker, Database, PyGeolocator
 # from sijapi.config.config import load_config
 # cfg = load_config()
@@ -56,7 +56,6 @@ os.makedirs(REQUESTS_DIR, exist_ok=True)
 REQUESTS_LOG_PATH = LOGS_DIR / "requests.log"

 ### LOCATE AND WEATHER LOCALIZATIONS
 USER_FULLNAME = os.getenv('USER_FULLNAME')
 USER_BIO = os.getenv('USER_BIO')
@@ -68,7 +67,8 @@ VISUALCROSSING_API_KEY = os.getenv("VISUALCROSSING_API_KEY")
 GEONAMES_TXT = DATA_DIR / "geonames.txt"
 LOCATIONS_CSV = DATA_DIR / "US.csv"
 TZ = tz.gettz(os.getenv("TZ", "America/Los_Angeles"))
-DynamicTZ = TimezoneTracker(DB)
+TZ_CACHE = DATA_DIR / "tzcache.json"
+DynamicTZ = TimezoneTracker(TZ_CACHE)

 ### Obsidian & notes
 ALLOWED_FILENAME_CHARS = r'[^\w \.-]'
@@ -90,13 +90,14 @@ YEAR_FMT = os.getenv("YEAR_FMT")
 MONTH_FMT = os.getenv("MONTH_FMT")
 DAY_FMT = os.getenv("DAY_FMT")
 DAY_SHORT_FMT = os.getenv("DAY_SHORT_FMT")
+GEOLOCATOR = PyGeolocator()

 ### Large language model
 LLM_URL = os.getenv("LLM_URL", "http://localhost:11434")
 LLM_SYS_MSG = os.getenv("SYSTEM_MSG", "You are a helpful AI assistant.")
 SUMMARY_INSTRUCT = os.getenv('SUMMARY_INSTRUCT', "You are an AI assistant that provides accurate summaries of text -- nothing more and nothing less. You must not include ANY extraneous text other than the summary. Do not include comments apart from the summary, do not preface the summary, and do not provide any form of postscript. Do not add paragraph breaks. Do not add any kind of formatting. Your response should begin with, consist of, and end with an accurate plaintext summary.")
 SUMMARY_INSTRUCT_TTS = os.getenv('SUMMARY_INSTRUCT_TTS', "You are an AI assistant that provides email summaries for Sanjay. Your response will undergo Text-To-Speech conversion and be added to Sanjay's private podcast. Providing adequate context (Sanjay did not send this question to you, he will only hear your response) but aiming for conciseness and precision, and bearing in mind the Text-To-Speech conversion (avoiding acronyms and formalities), summarize the following email.")
-DEFAULT_LLM = os.getenv("DEFAULT_LLM", "dolphin-mistral")
+DEFAULT_LLM = os.getenv("DEFAULT_LLM", "llama3")
 DEFAULT_VISION = os.getenv("DEFAULT_VISION", "llava")
 DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "Luna")
 DEFAULT_11L_VOICE = os.getenv("DEFAULT_11L_VOICE", "Victoria")

View file

@@ -8,6 +8,11 @@ from pydantic import BaseModel, Field
 from typing import Optional
 import asyncpg
 import os
+from typing import Optional, Tuple, Union
+from datetime import datetime, timedelta
+import json
+from timezonefinder import TimezoneFinder
+from pathlib import Path
 from pydantic import BaseModel, Field
 from typing import Optional
@@ -21,6 +26,38 @@ from typing import Optional
 import asyncpg
 from contextlib import asynccontextmanager
+import reverse_geocoder as rg
+from timezonefinder import TimezoneFinder
+from srtm import get_data
+
+class PyGeolocator:
+    def __init__(self):
+        self.tf = TimezoneFinder()
+        self.srtm_data = get_data()
+
+    def get_location(self, lat, lon):
+        result = rg.search((lat, lon))
+        return result[0]['name'], result[0]['admin1'], result[0]['cc']
+
+    def get_elevation(self, lat, lon):
+        return self.srtm_data.get_elevation(lat, lon)
+
+    def get_timezone(self, lat, lon):
+        return self.tf.timezone_at(lat=lat, lng=lon)
+
+    def lookup(self, lat, lon):
+        city, state, country = self.get_location(lat, lon)
+        elevation = self.get_elevation(lat, lon)
+        timezone = self.get_timezone(lat, lon)
+        return {
+            "city": city,
+            "state": state,
+            "country": country,
+            "elevation": elevation,
+            "timezone": timezone
+        }
+
 class Database(BaseModel):
     host: str = Field(..., description="Database host")
     port: int = Field(5432, description="Database port")
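
For orientation, a minimal usage sketch of the new offline geocoder (not part of this commit; the coordinates are arbitrary, and it assumes the reverse_geocoder, timezonefinder, and srtm.py packages are installed with their bundled datasets):

    # Hypothetical usage; coordinates are illustrative, not from the commit.
    geolocator = PyGeolocator()
    info = geolocator.lookup(44.05, -123.09)
    print(info["city"], info["state"], info["country"])  # nearest place, region, country code
    print(info["elevation"])  # meters, read from local SRTM tiles
    print(info["timezone"])   # IANA name, e.g. "America/Los_Angeles"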
@@ -108,7 +145,6 @@ class IncomingEmail(BaseModel):
     attachments: List[dict] = []

 class Location(BaseModel):
     latitude: float
     longitude: float
@@ -141,24 +177,18 @@ class Location(BaseModel):
     }

 class TimezoneTracker:
-    def __init__(self, db_config: Database, cache_file: str = 'timezone_cache.json'):
-        self.db_config = db_config
-        self.cache_file = cache_file
+    def __init__(self, cache_file: Union[str, Path] = 'timezone_cache.json'):
+        self.cache_file = Path(cache_file)
         self.last_timezone: str = "America/Los_Angeles"
         self.last_update: Optional[datetime] = None
         self.last_location: Optional[Tuple[float, float]] = None
+        self.tf = TimezoneFinder()

-    async def find(self, lat: float, lon: float) -> str:
-        query = """
-            SELECT tzid
-            FROM timezones
-            WHERE ST_Contains(geom, ST_SetSRID(ST_MakePoint($1, $2), 4326))
-            LIMIT 1;
-        """
-        async with await self.db_config.get_connection() as conn:
-            result = await conn.fetchrow(query, lon, lat)
-        return result['tzid'] if result else 'Unknown'
+    def find(self, lat: float, lon: float) -> str:
+        timezone = self.tf.timezone_at(lat=lat, lng=lon)
+        return timezone if timezone else 'Unknown'

     async def refresh(self, location: Union[Location, Tuple[float, float]], force: bool = False) -> str:
         if isinstance(location, Location):
@@ -167,22 +197,15 @@ class TimezoneTracker:
             lat, lon = location
         current_time = datetime.now()
         if (force or
                 not self.last_update or
                 current_time - self.last_update > timedelta(hours=1) or
                 self.last_location != (lat, lon)):
-            new_timezone = await self.find(lat, lon)
+            new_timezone = self.find(lat, lon)
             self.last_timezone = new_timezone
             self.last_update = current_time
             self.last_location = (lat, lon)
             await self.save_to_cache()
-            return new_timezone
         return self.last_timezone
     async def save_to_cache(self):

@@ -191,12 +214,12 @@ class TimezoneTracker:
             'last_update': self.last_update.isoformat() if self.last_update else None,
             'last_location': self.last_location
         }
-        with open(self.cache_file, 'w') as f:
+        with self.cache_file.open('w') as f:
             json.dump(cache_data, f)

     async def load_from_cache(self):
         try:
-            with open(self.cache_file, 'r') as f:
+            with self.cache_file.open('r') as f:
                 cache_data = json.load(f)
             self.last_timezone = cache_data.get('last_timezone')
             self.last_update = datetime.fromisoformat(cache_data['last_update']) if cache_data.get('last_update') else None

@@ -208,7 +231,7 @@ class TimezoneTracker:
     async def get_current(self, location: Union[Location, Tuple[float, float]]) -> str:
         await self.load_from_cache()
         return await self.refresh(location)

     async def get_last(self) -> Optional[str]:
         await self.load_from_cache()
         return self.last_timezone
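
The tracker now resolves timezones locally with timezonefinder instead of querying a PostGIS timezones table. A minimal driving sketch, assuming an asyncio entry point (the coordinates are illustrative, not from the commit):

    import asyncio

    async def main():
        # The constructor now takes only a cache path -- no Database config.
        tracker = TimezoneTracker("timezone_cache.json")
        # get_current() loads the cache, then refreshes if the last fix is
        # over an hour old or the coordinates have changed.
        tz_name = await tracker.get_current((45.52, -122.68))
        print(tz_name)  # e.g. "America/Los_Angeles"

    asyncio.run(main())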

sijapi/data/osm.sh Executable file
View file

@@ -0,0 +1,51 @@
+#!/bin/bash
+set -e  # Exit immediately if a command exits with a non-zero status.
+
+# Set variables
+DB_NAME="sij"
+DB_USER="sij"
+OSM_FILE="north-america-latest.osm.pbf"
+FLAT_NODES="/Users/sij/workshop/sijapi/sijapi/data/db/flat-nodes.bin"
+
+# Ensure the directory for flat-nodes exists
+mkdir -p "$(dirname "$FLAT_NODES")"
+
+# Determine total system memory in MB
+TOTAL_MEM=$(sysctl hw.memsize | awk '{print $2 / 1024 / 1024}')
+
+# Calculate cache size (50% of total memory, max 32GB)
+CACHE_SIZE=$(echo "scale=0; $TOTAL_MEM * 0.5 / 1" | bc)
+CACHE_SIZE=$(( CACHE_SIZE > 32768 ? 32768 : CACHE_SIZE ))
+
+# Calculate number of processes (number of CPU cores minus 1, min 1)
+NUM_PROCESSES=$(sysctl -n hw.ncpu)
+NUM_PROCESSES=$(( NUM_PROCESSES > 1 ? NUM_PROCESSES - 1 : 1 ))
+
+echo "Starting OSM data import..."
+
+# Run osm2pgsql
+osm2pgsql -d $DB_NAME \
+    --create \
+    --slim \
+    -G \
+    --hstore \
+    --tag-transform-script /opt/homebrew/Cellar/osm2pgsql/1.11.0_1/share/osm2pgsql/openstreetmap-carto.lua \
+    -C $CACHE_SIZE \
+    --number-processes $NUM_PROCESSES \
+    -S /opt/homebrew/Cellar/osm2pgsql/1.11.0_1/share/osm2pgsql/default.style \
+    --prefix osm \
+    -H localhost \
+    -P 5432 \
+    -U $DB_USER \
+    --flat-nodes $FLAT_NODES \
+    $OSM_FILE
+
+echo "OSM data import completed. Creating indexes..."
+
+# Create indexes (adjust table names if necessary)
+psql -d $DB_NAME -U $DB_USER -c "CREATE INDEX IF NOT EXISTS idx_osm_point_way ON osm_point USING GIST (way);"
+psql -d $DB_NAME -U $DB_USER -c "CREATE INDEX IF NOT EXISTS idx_osm_line_way ON osm_line USING GIST (way);"
+psql -d $DB_NAME -U $DB_USER -c "CREATE INDEX IF NOT EXISTS idx_osm_polygon_way ON osm_polygon USING GIST (way);"
+
+echo "Import completed and indexes created."

View file

@@ -204,7 +204,7 @@ tags:
     with open(md_path, 'w', encoding='utf-8') as md_file:
         md_file.write(markdown_content)

-    L.INFO(f"Saved markdown to {md_path}")
+    L.DEBUG(f"Saved markdown to {md_path}")

     return True
@@ -221,7 +221,12 @@ async def autorespond(this_email: IncomingEmail, account: EmailAccount):
     auto_response_subject = f"Auto-Response Re: {this_email.subject}"
     auto_response_body = await generate_auto_response_body(this_email, profile, account)
     L.DEBUG(f"Auto-response: {auto_response_body}")
-    await send_auto_response(this_email.sender, auto_response_subject, auto_response_body, profile, account)
+    success = await send_auto_response(this_email.sender, auto_response_subject, auto_response_body, profile, account)
+    if success == True:
+        return True
+
+    L.WARN(f"We were not able to successfully auto-respond to {this_email.subject}")
+    return False

 async def send_auto_response(to_email, subject, body, profile, account):
     try:
@@ -264,13 +269,16 @@ async def save_processed_uid(filename: Path, account_name: str, uid: str):
         f.write(f"{account_name}:{uid}\n")

 async def process_account_summarization(account: EmailAccount):
-    summarized_log = EMAIL_LOGS / "summarized.txt"
+    summarized_log = EMAIL_LOGS / account.name / "summarized.txt"
+    os.makedirs(summarized_log.parent, exist_ok=True)

     while True:
         try:
             processed_uids = await load_processed_uids(summarized_log)
+            L.DEBUG(f"{len(processed_uids)} emails marked as already summarized are being ignored.")
             with get_imap_connection(account) as inbox:
                 unread_messages = inbox.messages(unread=True)
+                L.DEBUG(f"There are {len(unread_messages)} unread messages.")
                 for uid, message in unread_messages:
                     uid_str = uid.decode() if isinstance(uid, bytes) else str(uid)
                     if uid_str not in processed_uids:
@@ -282,27 +290,35 @@ async def process_account_summarization(account: EmailAccount):
                             recipients=recipients,
                             subject=message.subject,
                             body=clean_email_content(message.body['html'][0]) if message.body['html'] else clean_email_content(message.body['plain'][0]) or "",
                             attachments=message.attachments
                         )

                         if account.summarize:
                             save_success = await save_email(this_email, account)
                             if save_success:
                                 await save_processed_uid(summarized_log, account.name, uid_str)
                                 L.INFO(f"Summarized email: {uid_str}")
+                            else:
+                                L.WARN(f"Failed to summarize {this_email.subject}")
+                        else:
+                            L.INFO(f"account.summarize shows as false.")
+                    else:
+                        L.DEBUG(f"Skipping {uid_str} because it was already processed.")
         except Exception as e:
             L.ERR(f"An error occurred during summarization for account {account.name}: {e}")
         await asyncio.sleep(account.refresh)
 async def process_account_autoresponding(account: EmailAccount):
-    autoresponded_log = EMAIL_LOGS / "autoresponded.txt"
+    autoresponded_log = EMAIL_LOGS / account.name / "autoresponded.txt"
+    os.makedirs(autoresponded_log.parent, exist_ok=True)

     while True:
         try:
             processed_uids = await load_processed_uids(autoresponded_log)
-            L.DEBUG(f"{len(processed_uids)} already processed emails are being ignored.")
+            L.DEBUG(f"{len(processed_uids)} emails marked as already responded to are being ignored.")
             with get_imap_connection(account) as inbox:
                 unread_messages = inbox.messages(unread=True)
+                L.DEBUG(f"There are {len(unread_messages)} unread messages.")
                 for uid, message in unread_messages:
                     uid_str = uid.decode() if isinstance(uid, bytes) else str(uid)
                     if uid_str not in processed_uids:
@@ -320,7 +336,10 @@ async def process_account_autoresponding(account: EmailAccount):
                         respond_success = await autorespond(this_email, account)
                         if respond_success:
                             await save_processed_uid(autoresponded_log, account.name, uid_str)
-                            L.WARN(f"Auto-responded to email: {uid_str}")
+                            L.WARN(f"Auto-responded to email: {this_email.subject}")
+                        else:
+                            L.WARN(f"Failed auto-response to {this_email.subject}")
+                    else:
+                        L.DEBUG(f"Skipping {uid_str} because it was already processed.")
         except Exception as e:
             L.ERR(f"An error occurred during auto-responding for account {account.name}: {e}")

View file

@@ -211,7 +211,7 @@ async def process_for_daily_note(file: Optional[UploadFile] = File(None), text:
     text_entry = text if text else ""

-    L.INFO(f"transcription: {transcription}\nfile_entry: {file_entry}\ntext_entry: {text_entry}")
+    L.DEBUG(f"transcription: {transcription}\nfile_entry: {file_entry}\ntext_entry: {text_entry}")

     return await add_to_daily_note(transcription, file_entry, text_entry, now)
@@ -520,19 +520,22 @@ async def process_archive(
     markdown_content = f"---\n"
     markdown_content += f"title: {readable_title}\n"
     markdown_content += f"added: {timestamp}\n"
+    markdown_content += f"url: {url}\n"
+    markdown_content += f"date: {datetime.now().strftime('%Y-%m-%d')}\n"
     markdown_content += f"---\n\n"
     markdown_content += f"# {readable_title}\n\n"
+    markdown_content += f"Clipped from [{url}]({url}) on {timestamp}\n\n"
     markdown_content += content

     try:
         markdown_path.parent.mkdir(parents=True, exist_ok=True)
         with open(markdown_path, 'w', encoding=encoding) as md_file:
             md_file.write(markdown_content)
-        L.INFO(f"Successfully saved to {markdown_path}")
+        L.DEBUG(f"Successfully saved to {markdown_path}")
         return markdown_path
     except Exception as e:
-        L.ERR(f"Failed to write markdown file: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Failed to write markdown file: {str(e)}")
+        L.WARN(f"Failed to write markdown file: {str(e)}")
+        return None
 def download_file(url, folder):
     os.makedirs(folder, exist_ok=True)

@@ -800,7 +803,7 @@ async def update_dn_weather(date_time: datetime):
     city = city if city else loc.city
     city = city if city else loc.house_number + ' ' + loc.road

-    L.INFO(f"City geocoded: {city}")
+    L.DEBUG(f"City geocoded: {city}")

     # Assemble journal path
     absolute_path, relative_path = assemble_journal_path(date_time, filename="Weather", extension=".md", no_timestamp = True)

View file

@@ -295,7 +295,7 @@ async def extract_text_from_pdf(file_path: str) -> str:
         L.ERR(f"Error extracting text with pdfminer.six: {e}")

     # If both methods fail or are deemed insufficient, use OCR as the last resort
-    L.INFO("Falling back to OCR for text extraction...")
+    L.DEBUG("Falling back to OCR for text extraction...")
     return await ocr_pdf(file_path)

 async def is_valid_pdf(file_path: str) -> bool:

@@ -331,7 +331,7 @@ async def extract_text_from_pdf(file_path: str) -> str:
         L.ERR(f"Error extracting text with pdfminer.six: {str(e)}")

     # Fall back to OCR
-    L.INFO("Falling back to OCR for text extraction...")
+    L.DEBUG("Falling back to OCR for text extraction...")
     try:
         images = convert_from_path(file_path)
         ocr_texts = await asyncio.gather(*(asyncio.to_thread(pytesseract.image_to_string, img) for img in images))