Sanitized mirror from private repository - 2026-04-20 01:32:01 UTC
This commit is contained in:
382
scripts/subscription-auditor.py
Normal file
382
scripts/subscription-auditor.py
Normal file
@@ -0,0 +1,382 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Feature 16 — Subscription Auditor.
|
||||
|
||||
Monthly audit of email subscriptions across all organizer accounts.
|
||||
Classifies senders, identifies dormant subscriptions, and sends a report.
|
||||
|
||||
Cron: 0 9 1 * * cd /home/homelab/organized/repos/homelab && python3 scripts/subscription-auditor.py
|
||||
"""
|
||||
|
||||
import argparse
import html
import logging
import sqlite3
import sys
from collections import defaultdict
from datetime import datetime, timedelta
from pathlib import Path
from zoneinfo import ZoneInfo
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from lib.ollama import ollama_generate, ollama_available, OllamaUnavailableError
|
||||
from lib.notify import send_email
|
||||
|
||||
log = logging.getLogger(__name__)

# Root of the homelab scripts checkout; each organizer's database lives below it.
SCRIPTS_DIR = Path("/home/homelab/organized/repos/homelab/scripts")
# A sender with no activity for this many days is reported as dormant.
DORMANT_DAYS = 90

# Mailboxes to audit: each entry pairs an account name with the SQLite
# database (processed.db) its organizer script maintains.
ACCOUNTS = [
    {
        "name": "lzbellina92@gmail.com",
        "db": SCRIPTS_DIR / "gmail-organizer" / "processed.db",
    },
    {
        # NOTE(review): looks like a sanitization placeholder — confirm the real address.
        "name": "your-email@example.com",
        "db": SCRIPTS_DIR / "gmail-organizer-dvish" / "processed.db",
    },
    {
        "name": "admin@thevish.io",
        "db": SCRIPTS_DIR / "proton-organizer" / "processed.db",
    },
]
|
||||
|
||||
|
||||
# ── data gathering ───────────────────────────────────────────────────────────
|
||||
|
||||
def get_subscription_senders(db_path: Path) -> list[dict]:
    """Collect subscription-related rows from one organizer database.

    Reads the ``processed`` table for messages categorized as 'accounts' or
    'newsletters', plus the optional ``sender_cache`` table for aggregated
    per-sender stats.  Each row becomes a dict tagged with a ``source`` key
    ('processed' or 'sender_cache') so downstream code can tell the two
    shapes apart.

    Args:
        db_path: Path to an organizer's processed.db SQLite file.

    Returns:
        List of row dicts; empty if the database file does not exist.

    Raises:
        sqlite3.OperationalError: if the ``processed`` table is missing
            (propagated; only a missing ``sender_cache`` is tolerated).
    """
    if not db_path.exists():
        log.warning("DB not found: %s", db_path)
        return []

    results: list[dict] = []
    conn = sqlite3.connect(db_path)
    try:
        # Processed entries for the two subscription-style categories.
        rows = conn.execute(
            "SELECT message_id, category, processed_at FROM processed "
            "WHERE category IN ('accounts', 'newsletters') "
            "ORDER BY processed_at DESC"
        ).fetchall()
        for msg_id, category, processed_at in rows:
            results.append({
                "message_id": msg_id,
                "category": category,
                "processed_at": processed_at,
                "source": "processed",
            })

        # sender_cache is optional; older databases may not have the table.
        try:
            cache_rows = conn.execute(
                "SELECT sender, category, hit_count, last_seen FROM sender_cache "
                "WHERE category IN ('accounts', 'newsletters') "
                "ORDER BY hit_count DESC"
            ).fetchall()
            for sender, category, hit_count, last_seen in cache_rows:
                results.append({
                    "sender": sender,
                    "category": category,
                    "hit_count": hit_count,
                    "last_seen": last_seen,
                    "source": "sender_cache",
                })
        except sqlite3.OperationalError:
            log.debug("No sender_cache table in %s", db_path)
    finally:
        # Close even if the 'processed' query raises (e.g. missing table),
        # so one bad database cannot leak a connection per audit run.
        conn.close()
    return results
|
||||
|
||||
|
||||
def aggregate_senders(all_data: list[dict]) -> dict:
    """Merge per-account entries into one stats record per sender/domain.

    Combines both 'sender_cache' and 'processed' rows.  Cache rows carry a
    real sender and a hit count; processed rows only have a Message-ID, so
    the embedded domain is used as a stand-in sender.

    Returns:
        Plain dict mapping sender -> {count, last_seen, categories, accounts}.
    """
    def _blank() -> dict:
        return {"count": 0, "last_seen": "", "categories": set(), "accounts": set()}

    merged = defaultdict(_blank)

    for item in all_data:
        owner = item.get("account", "unknown")

        if item.get("source") == "sender_cache":
            key = item.get("sender", "unknown")
            weight = item.get("hit_count", 1)
            seen = item.get("last_seen", "")
        else:
            # Processed rows have no sender column; approximate with the
            # domain embedded in the Message-ID (<xxx@domain.com>).
            key = _extract_domain_from_message_id(item.get("message_id", ""))
            weight = 1
            seen = item.get("processed_at", "")

        stats = merged[key]
        stats["count"] += weight
        if seen > stats["last_seen"]:
            stats["last_seen"] = seen
        stats["categories"].add(item.get("category", ""))
        stats["accounts"].add(owner)

    return dict(merged)
|
||||
|
||||
|
||||
def _extract_domain_from_message_id(msg_id: str) -> str:
|
||||
"""Extract domain from Message-ID for sender approximation."""
|
||||
# Message-IDs like <abc123@mail.example.com>
|
||||
if "@" in msg_id:
|
||||
domain = msg_id.split("@")[-1].strip(">").strip()
|
||||
return domain
|
||||
return msg_id[:30] if msg_id else "unknown"
|
||||
|
||||
|
||||
# ── classification ───────────────────────────────────────────────────────────
|
||||
|
||||
def classify_senders(senders: dict, dormant_days=None) -> dict:
    """Attach a 'classification' to every sender record.

    Senders whose last activity predates the dormancy cutoff are labelled
    'dormant' directly.  The rest are sent to the local LLM to be labelled
    active_subscription / one_time_notification / marketing; anything the
    LLM skips — or everything, when the LLM is unavailable — falls back to
    'unknown'.

    Args:
        senders: Mapping of sender -> stats dict (from aggregate_senders);
            stats dicts are mutated in place.
        dormant_days: Dormancy threshold in days; defaults to the
            module-level DORMANT_DAYS (90).

    Returns:
        Mapping of sender -> stats dict, every record carrying a
        'classification' key.
    """
    days = DORMANT_DAYS if dormant_days is None else dormant_days
    # ISO-8601 strings compare chronologically, so string comparison below
    # against last_seen is a valid date comparison.
    cutoff = (datetime.now(tz=ZoneInfo("UTC")) - timedelta(days=days)).isoformat()

    dormant = {}
    to_classify = {}
    for sender, info in senders.items():
        if info["last_seen"] and info["last_seen"] < cutoff:
            info["classification"] = "dormant"
            dormant[sender] = info
        else:
            to_classify[sender] = info

    # Availability is checked exactly once: the original double-check could
    # leave senders without any classification if availability flipped
    # between the two calls.
    if to_classify:
        if ollama_available():
            classified = _llm_classify(to_classify)
            for sender, cls in classified.items():
                if sender in to_classify:
                    to_classify[sender]["classification"] = cls
            # The LLM may skip senders; give those an explicit fallback.
            for info in to_classify.values():
                info.setdefault("classification", "unknown")
        else:
            log.warning("Ollama not available, marking all non-dormant as 'unknown'")
            for info in to_classify.values():
                info["classification"] = "unknown"

    all_classified = {}
    all_classified.update(dormant)
    all_classified.update(to_classify)
    return all_classified
|
||||
|
||||
|
||||
def _llm_classify(senders: dict) -> dict:
    """Classify senders via LLM in batches.

    Builds a "SENDER: CLASSIFICATION" prompt per batch of senders and
    parses the model's reply line by line.  Only the three expected labels
    are accepted; senders the model skips or mislabels are simply absent
    from the result, so the caller must apply its own fallback.

    Returns:
        Mapping of sender -> classification for the senders that matched
        (possibly a subset of the input).
    """
    results: dict[str, str] = {}
    sender_list = list(senders.items())

    # Process in batches of 30 to keep each prompt to a size the local
    # model can answer in one response.
    batch_size = 30
    for i in range(0, len(sender_list), batch_size):
        batch = sender_list[i:i + batch_size]
        sender_lines = []
        for sender, info in batch:
            cats = ", ".join(info["categories"])
            sender_lines.append(f"{sender} (count={info['count']}, categories={cats})")

        prompt = (
            "Classify each email sender/domain as: active_subscription, "
            "one_time_notification, or marketing.\n"
            "Format: one line per sender, exactly: SENDER: CLASSIFICATION\n"
            "No extra text.\n\n"
            + "\n".join(sender_lines)
        )

        try:
            response = ollama_generate(prompt, temperature=0.1, num_predict=2000)
            for line in response.strip().split("\n"):
                line = line.strip()
                if ":" not in line:
                    continue
                # Split on the LAST colon so a sender containing ':' still
                # parses into (sender, classification).
                parts = line.rsplit(":", 1)
                if len(parts) != 2:
                    continue
                s = parts[0].strip()
                c = parts[1].strip().lower().replace(" ", "_")
                # Match sender back to our keys with a two-way substring
                # test: the model may echo extra text around the sender.
                for sender, _ in batch:
                    if sender in s or s in sender:
                        if c in ("active_subscription", "one_time_notification", "marketing"):
                            results[sender] = c
                        # NOTE(review): stops at the first fuzzy match even
                        # when the label was rejected — presumably intended
                        # (one response line maps to one sender); confirm
                        # before tightening.
                        break
        except OllamaUnavailableError as e:
            # Model went away mid-run: keep whatever was classified so far
            # and stop submitting further batches.
            log.warning("LLM classification failed: %s", e)
            break

    return results
|
||||
|
||||
|
||||
# ── report ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def build_report(classified: dict, dormant_days=None) -> tuple[str, str]:
    """Build the plain-text and HTML subscription audit report.

    Args:
        classified: Mapping of sender -> stats dict; each stats dict is
            expected to carry 'count', 'last_seen', 'categories' and
            'accounts'.  Records without a 'classification' key are
            reported under "Unclassified".
        dormant_days: Threshold (days) shown in the dormancy labels;
            defaults to the module-level DORMANT_DAYS.

    Returns:
        (text_body, html_body) — the same report rendered both ways.
    """
    days = DORMANT_DAYS if dormant_days is None else dormant_days
    now = datetime.now(tz=ZoneInfo("America/Los_Angeles"))

    # Bucket senders by classification; busiest senders first per bucket.
    groups = defaultdict(list)
    for sender, info in classified.items():
        groups[info.get("classification", "unknown")].append((sender, info))
    for cls in groups:
        groups[cls].sort(key=lambda item: item[1]["count"], reverse=True)

    # Summary counts.
    total_senders = len(classified)
    active_count = len(groups.get("active_subscription", []))
    dormant_count = len(groups.get("dormant", []))
    marketing_count = len(groups.get("marketing", []))
    one_time_count = len(groups.get("one_time_notification", []))
    unknown_count = len(groups.get("unknown", []))

    # Section order shared by both report flavors.
    display_order = [
        ("active_subscription", "Active Subscriptions"),
        ("marketing", "Marketing / Promotional"),
        ("dormant", "Dormant (Potential Unsubscribes)"),
        ("one_time_notification", "One-Time Notifications"),
        ("unknown", "Unclassified"),
    ]

    # ── plain-text report ──
    text_lines = [
        f"Monthly Subscription Audit — {now.strftime('%Y-%m-%d %H:%M %Z')}",
        "=" * 60,
        "",
        f"Total unique senders/domains: {total_senders}",
        f"Active subscriptions: {active_count}",
        f"Marketing/promotional: {marketing_count}",
        f"One-time notifications: {one_time_count}",
        f"Dormant (>{days} days): {dormant_count}",
        f"Unclassified: {unknown_count}",
        "",
    ]

    for cls_key, cls_label in display_order:
        items = groups.get(cls_key, [])
        if not items:
            continue
        text_lines.append(f"--- {cls_label} ({len(items)}) ---")
        for sender, info in items:
            cats = ", ".join(info["categories"])
            accts = ", ".join(info["accounts"])
            text_lines.append(
                f"  {sender:40s} count={info['count']:>4} last={info['last_seen'][:10]} "
                f"cats=[{cats}] accts=[{accts}]"
            )
        text_lines.append("")

    text_body = "\n".join(text_lines)

    # ── HTML report ──
    html_parts = [
        "<html><body>",
        "<h2>Monthly Subscription Audit</h2>",
        f"<p>{now.strftime('%Y-%m-%d %H:%M %Z')}</p>",
        "<h3>Summary</h3>",
        "<table border='1' cellpadding='6' cellspacing='0' style='border-collapse:collapse;'>",
        f"<tr><td>Total unique senders</td><td><b>{total_senders}</b></td></tr>",
        f"<tr><td>Active subscriptions</td><td>{active_count}</td></tr>",
        f"<tr><td>Marketing/promotional</td><td>{marketing_count}</td></tr>",
        f"<tr><td>One-time notifications</td><td>{one_time_count}</td></tr>",
        f"<tr><td>Dormant (>{days} days)</td><td style='color:orange;'>{dormant_count}</td></tr>",
        f"<tr><td>Unclassified</td><td>{unknown_count}</td></tr>",
        "</table>",
    ]

    for cls_key, cls_label in display_order:
        items = groups.get(cls_key, [])
        if not items:
            continue
        html_parts.append(f"<h3>{cls_label} ({len(items)})</h3>")
        html_parts.append(
            "<table border='1' cellpadding='4' cellspacing='0' style='border-collapse:collapse; font-size:13px;'>"
        )
        html_parts.append(
            "<tr><th>Sender/Domain</th><th>Count</th><th>Last Seen</th>"
            "<th>Categories</th><th>Accounts</th></tr>"
        )
        for sender, info in items:
            # Sender domains/categories originate from external email
            # metadata (Message-IDs, headers) — escape them so a crafted
            # Message-ID cannot inject markup into the emailed report.
            cats = html.escape(", ".join(info["categories"]))
            accts = html.escape(", ".join(info["accounts"]))
            last = info["last_seen"][:10] if info["last_seen"] else "N/A"
            html_parts.append(
                f"<tr><td>{html.escape(sender)}</td><td>{info['count']}</td>"
                f"<td>{last}</td><td>{cats}</td><td>{accts}</td></tr>"
            )
        html_parts.append("</table>")

    html_parts.append("<hr>")
    html_parts.append(
        f"<p><small>Generated {now.strftime('%Y-%m-%d %H:%M %Z')} by subscription-auditor.py</small></p>"
    )
    html_parts.append("</body></html>")

    html_body = "\n".join(html_parts)

    return text_body, html_body
|
||||
|
||||
|
||||
# ── main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """CLI entry point: collect, aggregate, classify, then report."""
    parser = argparse.ArgumentParser(description="Monthly Subscription Auditor — analyze email subscriptions")
    parser.add_argument("--dry-run", action="store_true", help="Print report without sending email")
    parser.add_argument("--verbose", action="store_true", help="Enable debug logging")
    args = parser.parse_args()

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )

    # Collect subscription rows from every configured organizer database,
    # tagging each row with the account it came from.
    collected = []
    for account in ACCOUNTS:
        log.info("Reading data from %s", account["name"])
        rows = get_subscription_senders(account["db"])
        for row in rows:
            row["account"] = account["name"]
        collected.extend(rows)
        log.info("  Found %d subscription-related entries", len(rows))

    if not collected:
        log.info("No subscription data found across any account.")
        if args.dry_run:
            print("No subscription data found.")
        return

    # Aggregate per sender/domain, then classify each one.
    senders = aggregate_senders(collected)
    log.info("Aggregated %d unique senders/domains", len(senders))
    classified = classify_senders(senders)

    # Render the report, then either print it (dry run) or email it.
    text_body, html_body = build_report(classified)
    if args.dry_run:
        print(text_body)
        return

    now = datetime.now(tz=ZoneInfo("America/Los_Angeles"))
    subject = f"Monthly Subscription Audit — {now.strftime('%B %Y')}"
    send_email(subject=subject, html_body=html_body, text_body=text_body)
    log.info("Subscription audit report emailed")
|
||||
|
||||
|
||||
# Run the audit when invoked as a script (see the cron line in the module docstring).
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user