#!/usr/bin/env python3
"""Feature 16 — Subscription Auditor.

Monthly audit of email subscriptions across all organizer accounts.
Classifies senders, identifies dormant subscriptions, and sends a report.

Cron: 0 9 1 * * cd /home/homelab/organized/repos/homelab && python3 scripts/subscription-auditor.py
"""

import argparse
import logging
import sqlite3
import sys
from collections import defaultdict
from contextlib import closing
from datetime import datetime, timedelta
from pathlib import Path
from zoneinfo import ZoneInfo

sys.path.insert(0, str(Path(__file__).parent))

from lib.ollama import ollama_generate, ollama_available, OllamaUnavailableError
from lib.notify import send_email

log = logging.getLogger(__name__)

SCRIPTS_DIR = Path("/home/homelab/organized/repos/homelab/scripts")

# A sender not seen for this many days is considered dormant (unsubscribe candidate).
DORMANT_DAYS = 90

ACCOUNTS = [
    {
        "name": "lzbellina92@gmail.com",
        "db": SCRIPTS_DIR / "gmail-organizer" / "processed.db",
    },
    {
        "name": "your-email@example.com",
        "db": SCRIPTS_DIR / "gmail-organizer-dvish" / "processed.db",
    },
    {
        "name": "admin@thevish.io",
        "db": SCRIPTS_DIR / "proton-organizer" / "processed.db",
    },
]


# ── data gathering ───────────────────────────────────────────────────────────

def get_subscription_senders(db_path: Path) -> list[dict]:
    """Query processed.db for accounts and newsletters categories, plus sender_cache.

    Returns a list of dicts; each dict carries a ``source`` key of either
    ``"processed"`` (message-level rows) or ``"sender_cache"`` (aggregated
    per-sender rows). Missing DBs yield an empty list rather than raising.
    """
    if not db_path.exists():
        log.warning("DB not found: %s", db_path)
        return []

    results: list[dict] = []
    # closing() guarantees the connection is released even if a query raises
    # (the original closed it only on the success path).
    with closing(sqlite3.connect(db_path)) as conn:
        # Message-level entries for the two subscription-related categories.
        rows = conn.execute(
            "SELECT message_id, category, processed_at FROM processed "
            "WHERE category IN ('accounts', 'newsletters') "
            "ORDER BY processed_at DESC"
        ).fetchall()
        for msg_id, category, processed_at in rows:
            results.append({
                "message_id": msg_id,
                "category": category,
                "processed_at": processed_at,
                "source": "processed",
            })

        # Also pull from sender_cache if it exists; older DBs lack this table,
        # which sqlite reports as an OperationalError on the SELECT.
        try:
            cache_rows = conn.execute(
                "SELECT sender, category, hit_count, last_seen FROM sender_cache "
                "WHERE category IN ('accounts', 'newsletters') "
                "ORDER BY hit_count DESC"
            ).fetchall()
        except sqlite3.OperationalError:
            log.debug("No sender_cache table in %s", db_path)
        else:
            for sender, category, hit_count, last_seen in cache_rows:
                results.append({
                    "sender": sender,
                    "category": category,
                    "hit_count": hit_count,
                    "last_seen": last_seen,
                    "source": "sender_cache",
                })
    return results


def aggregate_senders(all_data: list[dict]) -> dict:
    """Aggregate sender data. Returns dict of sender -> stats.

    Stats per sender: ``count`` (messages or cache hit_count), ``last_seen``
    (max timestamp string seen), ``categories`` (set), ``accounts`` (set of
    account names the sender appeared under).
    """
    senders = defaultdict(lambda: {
        "count": 0,
        "last_seen": "",
        "categories": set(),
        "accounts": set(),
    })

    for entry in all_data:
        account_name = entry.get("account", "unknown")
        if entry.get("source") == "sender_cache":
            # Cache rows are already per-sender; fold in their hit counts.
            sender = entry.get("sender", "unknown")
            info = senders[sender]
            info["count"] += entry.get("hit_count", 1)
            # Timestamps are compared lexically; assumes a sortable format
            # like ISO 8601 — TODO confirm against the organizer schema.
            if entry.get("last_seen", "") > info["last_seen"]:
                info["last_seen"] = entry["last_seen"]
            info["categories"].add(entry.get("category", ""))
            info["accounts"].add(account_name)
        else:
            # For processed entries, extract sender domain from Message-ID
            # as a proxy for the sender identity.
            msg_id = entry.get("message_id", "")
            sender = _extract_domain_from_message_id(msg_id)
            info = senders[sender]
            info["count"] += 1
            if entry.get("processed_at", "") > info["last_seen"]:
                info["last_seen"] = entry["processed_at"]
            info["categories"].add(entry.get("category", ""))
            info["accounts"].add(account_name)

    return dict(senders)


def _extract_domain_from_message_id(msg_id: str) -> str:
    """Extract domain from Message-ID for sender approximation.

    Message-IDs conventionally look like ``<token@sender-domain>``, so the
    text after the last ``@`` (with the trailing ``>`` stripped) approximates
    the sending domain. Falls back to a truncated ID when no ``@`` is present.
    """
    if "@" in msg_id:
        return msg_id.split("@")[-1].strip(">").strip()
    return msg_id[:30] if msg_id else "unknown"


# ── classification ───────────────────────────────────────────────────────────

def classify_senders(senders: dict) -> dict:
    """Classify senders as dormant, active_subscription, one_time_notification, or marketing.

    Dormancy is decided locally from ``last_seen``; everything else is
    delegated to the LLM. Mutates the per-sender info dicts in place by
    adding a ``classification`` key, and returns the combined mapping.
    """
    # NOTE(review): cutoff is an aware-UTC ISO string (ends in "+00:00") while
    # stored last_seen values may be naive — the lexical comparison below only
    # works if both use the same format; confirm against the organizer DBs.
    cutoff = (datetime.now(tz=ZoneInfo("UTC")) - timedelta(days=DORMANT_DAYS)).isoformat()

    dormant: dict = {}
    to_classify: dict = {}
    for sender, info in senders.items():
        if info["last_seen"] and info["last_seen"] < cutoff:
            info["classification"] = "dormant"
            dormant[sender] = info
        else:
            to_classify[sender] = info

    if to_classify:
        # Check availability exactly once (the original called ollama_available()
        # twice, so a status change between calls could skip both branches).
        if ollama_available():
            classified = _llm_classify(to_classify)
            for sender, cls in classified.items():
                if sender in to_classify:
                    to_classify[sender]["classification"] = cls
        else:
            log.warning("Ollama not available, marking all non-dormant as 'unknown'")
        # Anything the LLM missed (or everything, if Ollama was down) -> unknown.
        for info in to_classify.values():
            info.setdefault("classification", "unknown")

    all_classified: dict = {}
    all_classified.update(dormant)
    all_classified.update(to_classify)
    return all_classified


def _llm_classify(senders: dict) -> dict:
    """Classify senders via LLM in batches.

    Returns a mapping of sender -> classification for every sender the model
    answered recognizably; unanswered senders are simply absent.
    """
    results: dict = {}
    sender_list = list(senders.items())

    # Batch to keep each prompt within a manageable context size.
    batch_size = 30
    for i in range(0, len(sender_list), batch_size):
        batch = sender_list[i:i + batch_size]

        sender_lines = []
        for sender, info in batch:
            cats = ", ".join(info["categories"])
            sender_lines.append(f"{sender} (count={info['count']}, categories={cats})")

        prompt = (
            "Classify each email sender/domain as: active_subscription, "
            "one_time_notification, or marketing.\n"
            "Format: one line per sender, exactly: SENDER: CLASSIFICATION\n"
            "No extra text.\n\n" + "\n".join(sender_lines)
        )

        try:
            response = ollama_generate(prompt, temperature=0.1, num_predict=2000)
            for line in response.strip().split("\n"):
                line = line.strip()
                if ":" not in line:
                    continue
                # rsplit so sender text containing ":" doesn't break parsing.
                parts = line.rsplit(":", 1)
                if len(parts) != 2:
                    continue
                s = parts[0].strip()
                c = parts[1].strip().lower().replace(" ", "_")
                # Match the model's echoed sender back to our keys; substring
                # matching tolerates the model truncating or decorating names.
                for sender, _ in batch:
                    if sender in s or s in sender:
                        if c in ("active_subscription", "one_time_notification", "marketing"):
                            results[sender] = c
                        break
        except OllamaUnavailableError as e:
            # Ollama went away mid-run: stop classifying, keep what we have.
            log.warning("LLM classification failed: %s", e)
            break

    return results


# ── report ───────────────────────────────────────────────────────────────────

def build_report(classified: dict) -> tuple[str, str]:
    """Build text and HTML subscription audit report.

    Returns ``(text_body, html_body)``. Groups senders by classification,
    largest count first, with a summary table up top.
    """
    # NOTE(review): the HTML markup below was reconstructed — the original
    # tag text was lost in extraction; cell contents match the original.
    now = datetime.now(tz=ZoneInfo("America/Los_Angeles"))

    # Group by classification.
    groups = defaultdict(list)
    for sender, info in classified.items():
        groups[info.get("classification", "unknown")].append((sender, info))

    # Sort each group by count descending.
    for cls in groups:
        groups[cls].sort(key=lambda x: -x[1]["count"])

    # Stats
    total_senders = len(classified)
    active_count = len(groups.get("active_subscription", []))
    dormant_count = len(groups.get("dormant", []))
    marketing_count = len(groups.get("marketing", []))
    one_time_count = len(groups.get("one_time_notification", []))
    unknown_count = len(groups.get("unknown", []))

    # Text report
    text_lines = [
        f"Monthly Subscription Audit — {now.strftime('%Y-%m-%d %H:%M %Z')}",
        "=" * 60,
        "",
        f"Total unique senders/domains: {total_senders}",
        f"Active subscriptions: {active_count}",
        f"Marketing/promotional: {marketing_count}",
        f"One-time notifications: {one_time_count}",
        f"Dormant (>{DORMANT_DAYS} days): {dormant_count}",
        f"Unclassified: {unknown_count}",
        "",
    ]

    # Classification order for display.
    display_order = [
        ("active_subscription", "Active Subscriptions"),
        ("marketing", "Marketing / Promotional"),
        ("dormant", "Dormant (Potential Unsubscribes)"),
        ("one_time_notification", "One-Time Notifications"),
        ("unknown", "Unclassified"),
    ]

    for cls_key, cls_label in display_order:
        items = groups.get(cls_key, [])
        if not items:
            continue
        text_lines.append(f"--- {cls_label} ({len(items)}) ---")
        for sender, info in items:
            # sorted() so set iteration order doesn't make reports nondeterministic.
            cats = ", ".join(sorted(info["categories"]))
            accts = ", ".join(sorted(info["accounts"]))
            text_lines.append(
                f"  {sender:40s} count={info['count']:>4} last={info['last_seen'][:10]} "
                f"cats=[{cats}] accts=[{accts}]"
            )
        text_lines.append("")
    text_body = "\n".join(text_lines)

    # HTML report
    html_parts = [
        "<html><body>",
        "<h2>Monthly Subscription Audit</h2>",
        f"<p>{now.strftime('%Y-%m-%d %H:%M %Z')}</p>",
        "<h3>Summary</h3>",
        "<table border='1' cellpadding='4' cellspacing='0'>",
        f"<tr><td>Total unique senders</td><td>{total_senders}</td></tr>",
        f"<tr><td>Active subscriptions</td><td>{active_count}</td></tr>",
        f"<tr><td>Marketing/promotional</td><td>{marketing_count}</td></tr>",
        f"<tr><td>One-time notifications</td><td>{one_time_count}</td></tr>",
        f"<tr><td>Dormant (&gt;{DORMANT_DAYS} days)</td><td>{dormant_count}</td></tr>",
        f"<tr><td>Unclassified</td><td>{unknown_count}</td></tr>",
        "</table>",
    ]

    for cls_key, cls_label in display_order:
        items = groups.get(cls_key, [])
        if not items:
            continue
        html_parts.append(f"<h3>{cls_label} ({len(items)})</h3>")
        html_parts.append("<table border='1' cellpadding='4' cellspacing='0'>")
        html_parts.append(
            "<tr><th>Sender/Domain</th><th>Count</th><th>Last Seen</th>"
            "<th>Categories</th><th>Accounts</th></tr>"
        )
        for sender, info in items:
            cats = ", ".join(sorted(info["categories"]))
            accts = ", ".join(sorted(info["accounts"]))
            last = info["last_seen"][:10] if info["last_seen"] else "N/A"
            html_parts.append(
                f"<tr><td>{sender}</td><td>{info['count']}</td><td>{last}</td>"
                f"<td>{cats}</td><td>{accts}</td></tr>"
            )
        html_parts.append("</table>")

    html_parts.append(
        f"<p><i>Generated {now.strftime('%Y-%m-%d %H:%M %Z')} by subscription-auditor.py</i></p>"
    )
    html_parts.append("</body></html>")
    html_body = "\n".join(html_parts)

    return text_body, html_body


# ── main ─────────────────────────────────────────────────────────────────────

def main():
    """Entry point: gather, aggregate, classify, report, and (unless dry-run) email."""
    parser = argparse.ArgumentParser(
        description="Monthly Subscription Auditor — analyze email subscriptions"
    )
    parser.add_argument("--dry-run", action="store_true", help="Print report without sending email")
    parser.add_argument("--verbose", action="store_true", help="Enable debug logging")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )

    # Gather data from all accounts, tagging each entry with its account name.
    all_data = []
    for account in ACCOUNTS:
        account_name = account["name"]
        log.info("Reading data from %s", account_name)
        entries = get_subscription_senders(account["db"])
        for entry in entries:
            entry["account"] = account_name
        all_data.extend(entries)
        log.info("  Found %d subscription-related entries", len(entries))

    if not all_data:
        log.info("No subscription data found across any account.")
        if args.dry_run:
            print("No subscription data found.")
        return

    # Aggregate by sender.
    senders = aggregate_senders(all_data)
    log.info("Aggregated %d unique senders/domains", len(senders))

    # Classify.
    classified = classify_senders(senders)

    # Build report.
    text_body, html_body = build_report(classified)

    if args.dry_run:
        print(text_body)
        return

    now = datetime.now(tz=ZoneInfo("America/Los_Angeles"))
    subject = f"Monthly Subscription Audit — {now.strftime('%B %Y')}"
    send_email(subject=subject, html_body=html_body, text_body=text_body)
    log.info("Subscription audit report emailed")


if __name__ == "__main__":
    main()