Files
homelab-optimized/scripts/subscription-auditor.py
Gitea Mirror Bot af2cf711f4
Some checks failed
Documentation / Build Docusaurus (push) Failing after 5m0s
Documentation / Deploy to GitHub Pages (push) Has been skipped
Sanitized mirror from private repository - 2026-04-18 11:03:33 UTC
2026-04-18 11:03:33 +00:00

383 lines
14 KiB
Python

#!/usr/bin/env python3
"""Feature 16 — Subscription Auditor.
Monthly audit of email subscriptions across all organizer accounts.
Classifies senders, identifies dormant subscriptions, and sends a report.
Cron: 0 9 1 * * cd /home/homelab/organized/repos/homelab && python3 scripts/subscription-auditor.py
"""
import argparse
import logging
import sqlite3
import sys
from collections import defaultdict
from datetime import datetime, timedelta
from pathlib import Path
from zoneinfo import ZoneInfo
sys.path.insert(0, str(Path(__file__).parent))
from lib.ollama import ollama_generate, ollama_available, OllamaUnavailableError
from lib.notify import send_email
# Module-level logger; handlers/level are configured by basicConfig() in main().
log = logging.getLogger(__name__)

# Root directory that holds the per-account organizer script folders.
SCRIPTS_DIR = Path("/home/homelab/organized/repos/homelab/scripts")

# A sender not seen for more than this many days is classified "dormant".
DORMANT_DAYS = 90

# Organizer accounts to audit: each entry pairs an account label with the
# SQLite database (processed.db) that its organizer script maintains.
ACCOUNTS = [
    {
        "name": "lzbellina92@gmail.com",
        "db": SCRIPTS_DIR / "gmail-organizer" / "processed.db",
    },
    {
        # NOTE(review): placeholder address — presumably sanitized by the
        # mirror bot; confirm the real value in the private repository.
        "name": "your-email@example.com",
        "db": SCRIPTS_DIR / "gmail-organizer-dvish" / "processed.db",
    },
    {
        "name": "admin@thevish.io",
        "db": SCRIPTS_DIR / "proton-organizer" / "processed.db",
    },
]
# ── data gathering ───────────────────────────────────────────────────────────
def get_subscription_senders(db_path: Path) -> list[dict]:
    """Collect subscription-related rows from one organizer database.

    Reads the ``processed`` table for messages categorized as 'accounts'
    or 'newsletters', plus the optional ``sender_cache`` table when it
    exists.

    Args:
        db_path: Path to an organizer's ``processed.db`` SQLite file.

    Returns:
        A list of dicts, each tagged with ``source`` = "processed" or
        "sender_cache"; empty when the database file does not exist.
    """
    if not db_path.exists():
        log.warning("DB not found: %s", db_path)
        return []
    conn = sqlite3.connect(db_path)
    results: list[dict] = []
    try:
        # Messages already filed under the two subscription-ish categories.
        rows = conn.execute(
            "SELECT message_id, category, processed_at FROM processed "
            "WHERE category IN ('accounts', 'newsletters') "
            "ORDER BY processed_at DESC"
        ).fetchall()
        for msg_id, category, processed_at in rows:
            results.append({
                "message_id": msg_id,
                "category": category,
                "processed_at": processed_at,
                "source": "processed",
            })
        # sender_cache is optional — older organizer DBs may not have it.
        try:
            cache_rows = conn.execute(
                "SELECT sender, category, hit_count, last_seen FROM sender_cache "
                "WHERE category IN ('accounts', 'newsletters') "
                "ORDER BY hit_count DESC"
            ).fetchall()
            for sender, category, hit_count, last_seen in cache_rows:
                results.append({
                    "sender": sender,
                    "category": category,
                    "hit_count": hit_count,
                    "last_seen": last_seen,
                    "source": "sender_cache",
                })
        except sqlite3.OperationalError:
            log.debug("No sender_cache table in %s", db_path)
    finally:
        # Close even when a query raises, so the handle is never leaked
        # (the original leaked the connection on any unexpected error).
        conn.close()
    return results
def aggregate_senders(all_data: list[dict]) -> dict:
    """Merge per-account entries into per-sender statistics.

    Args:
        all_data: Entries from get_subscription_senders(), each optionally
            tagged with an ``account`` key by the caller.

    Returns:
        Mapping of sender/domain -> {"count", "last_seen", "categories",
        "accounts"}; the set-valued fields accumulate across accounts.
    """
    senders = defaultdict(lambda: {
        "count": 0,
        "last_seen": "",
        "categories": set(),
        "accounts": set(),
    })
    for entry in all_data:
        # Normalize the two row shapes (sender_cache vs processed) into a
        # common (key, weight, timestamp) triple so a single update path
        # replaces the original's duplicated branch bodies.
        if entry.get("source") == "sender_cache":
            key = entry.get("sender", "unknown")
            weight = entry.get("hit_count", 1)
            seen = entry.get("last_seen", "")
        else:
            # processed rows carry no sender column; approximate the sender
            # with the domain embedded in the Message-ID.
            key = _extract_domain_from_message_id(entry.get("message_id", ""))
            weight = 1
            seen = entry.get("processed_at", "")
        info = senders[key]
        info["count"] += weight
        # ISO-8601-style timestamp strings compare correctly lexicographically.
        if seen > info["last_seen"]:
            info["last_seen"] = seen
        info["categories"].add(entry.get("category", ""))
        info["accounts"].add(entry.get("account", "unknown"))
    return dict(senders)
def _extract_domain_from_message_id(msg_id: str) -> str:
    """Approximate a sender domain from a Message-ID header value.

    Message-IDs conventionally look like ``<local-part@domain>``, so the
    text after the last ``@`` (minus any closing angle bracket) serves as
    a stand-in for the sending domain.
    """
    if "@" not in msg_id:
        # No domain portion at all — fall back to a truncated raw ID.
        return msg_id[:30] if msg_id else "unknown"
    _, _, tail = msg_id.rpartition("@")
    return tail.strip(">").strip()
# ── classification ───────────────────────────────────────────────────────────
def classify_senders(senders: dict) -> dict:
    """Attach a ``classification`` to every aggregated sender.

    Senders unseen for more than DORMANT_DAYS are marked "dormant" locally;
    the rest are sent to the local LLM. Anything the LLM skips — or all of
    them, when Ollama is unavailable — becomes "unknown".

    Args:
        senders: sender -> info mapping from aggregate_senders();
            info dicts are mutated in place.

    Returns:
        The same sender -> info mapping, now including "classification".
    """
    # NOTE(review): assumes stored last_seen values are ISO-8601 strings
    # comparable to this offset-bearing UTC cutoff — verify against the DBs.
    cutoff = (datetime.now(tz=ZoneInfo("UTC")) - timedelta(days=DORMANT_DAYS)).isoformat()
    dormant = {}
    to_classify = {}
    for sender, info in senders.items():
        if info["last_seen"] and info["last_seen"] < cutoff:
            info["classification"] = "dormant"
            dormant[sender] = info
        else:
            to_classify[sender] = info
    if to_classify:
        # Probe Ollama exactly once: the original called ollama_available()
        # twice, so a flip between the two calls could skip both the LLM
        # pass and the warning.
        if ollama_available():
            classified = _llm_classify(to_classify)
            for sender, cls in classified.items():
                if sender in to_classify:
                    to_classify[sender]["classification"] = cls
        else:
            log.warning("Ollama not available, marking all non-dormant as 'unknown'")
        # Whatever the LLM skipped (or everything, when it was down).
        for sender, info in to_classify.items():
            info.setdefault("classification", "unknown")
    all_classified = {}
    all_classified.update(dormant)
    all_classified.update(to_classify)
    return all_classified
def _llm_classify(senders: dict) -> dict:
    """Classify senders via LLM in batches.

    Sends sender/domain summaries to the local Ollama model in batches and
    parses ``SENDER: CLASSIFICATION`` lines out of each response.

    Args:
        senders: Mapping of sender/domain -> aggregated stats ("count",
            "categories", ...).

    Returns:
        Mapping of sender -> one of "active_subscription",
        "one_time_notification", "marketing". Senders the model skipped or
        answered unrecognizably are simply absent from the result.
    """
    results = {}
    sender_list = list(senders.items())
    # Process in batches of 30 — keeps each prompt a manageable size.
    batch_size = 30
    for i in range(0, len(sender_list), batch_size):
        batch = sender_list[i:i + batch_size]
        sender_lines = []
        for sender, info in batch:
            cats = ", ".join(info["categories"])
            sender_lines.append(f"{sender} (count={info['count']}, categories={cats})")
        prompt = (
            "Classify each email sender/domain as: active_subscription, "
            "one_time_notification, or marketing.\n"
            "Format: one line per sender, exactly: SENDER: CLASSIFICATION\n"
            "No extra text.\n\n"
            + "\n".join(sender_lines)
        )
        try:
            # Low temperature nudges the model to stick to the requested format.
            response = ollama_generate(prompt, temperature=0.1, num_predict=2000)
            for line in response.strip().split("\n"):
                line = line.strip()
                if ":" not in line:
                    continue
                # Split on the LAST colon — the sender text itself may contain ':'.
                parts = line.rsplit(":", 1)
                if len(parts) != 2:
                    continue
                s = parts[0].strip()
                c = parts[1].strip().lower().replace(" ", "_")
                # Match sender back to our keys. Fuzzy substring match in either
                # direction, since the model may echo the sender with extra
                # annotation or abbreviate it; first hit wins for this line.
                for sender, _ in batch:
                    if sender in s or s in sender:
                        if c in ("active_subscription", "one_time_notification", "marketing"):
                            results[sender] = c
                        break
        except OllamaUnavailableError as e:
            # Ollama went away mid-run; keep whatever was classified so far.
            log.warning("LLM classification failed: %s", e)
            break
    return results
# ── report ───────────────────────────────────────────────────────────────────
def build_report(classified: dict) -> tuple[str, str]:
    """Build text and HTML subscription audit report.

    Args:
        classified: sender -> info mapping from classify_senders(); each
            info carries "count", "last_seen", "categories", "accounts",
            and "classification".

    Returns:
        (text_body, html_body) — two renderings of the same report.
    """
    now = datetime.now(tz=ZoneInfo("America/Los_Angeles"))
    # Group by classification
    groups = defaultdict(list)
    for sender, info in classified.items():
        groups[info.get("classification", "unknown")].append((sender, info))
    # Sort each group by count descending
    for cls in groups:
        groups[cls].sort(key=lambda x: -x[1]["count"])
    # Headline stats for the summary section.
    total_senders = len(classified)
    active_count = len(groups.get("active_subscription", []))
    dormant_count = len(groups.get("dormant", []))
    marketing_count = len(groups.get("marketing", []))
    one_time_count = len(groups.get("one_time_notification", []))
    unknown_count = len(groups.get("unknown", []))
    # Text report
    text_lines = [
        f"Monthly Subscription Audit — {now.strftime('%Y-%m-%d %H:%M %Z')}",
        "=" * 60,
        "",
        f"Total unique senders/domains: {total_senders}",
        f"Active subscriptions: {active_count}",
        f"Marketing/promotional: {marketing_count}",
        f"One-time notifications: {one_time_count}",
        f"Dormant (>{DORMANT_DAYS} days): {dormant_count}",
        f"Unclassified: {unknown_count}",
        "",
    ]
    # Classification order for display
    display_order = [
        ("active_subscription", "Active Subscriptions"),
        ("marketing", "Marketing / Promotional"),
        ("dormant", "Dormant (Potential Unsubscribes)"),
        ("one_time_notification", "One-Time Notifications"),
        ("unknown", "Unclassified"),
    ]
    for cls_key, cls_label in display_order:
        items = groups.get(cls_key, [])
        if not items:
            continue
        text_lines.append(f"--- {cls_label} ({len(items)}) ---")
        for sender, info in items:
            cats = ", ".join(info["categories"])
            accts = ", ".join(info["accounts"])
            # Guard empty last_seen the same way the HTML table does, so
            # both renderings show "N/A" instead of a blank (the original
            # text path printed an empty string here).
            last = info["last_seen"][:10] if info["last_seen"] else "N/A"
            text_lines.append(
                f" {sender:40s} count={info['count']:>4} last={last} "
                f"cats=[{cats}] accts=[{accts}]"
            )
        text_lines.append("")
    text_body = "\n".join(text_lines)
    # HTML report
    html_parts = [
        "<html><body>",
        "<h2>Monthly Subscription Audit</h2>",
        f"<p>{now.strftime('%Y-%m-%d %H:%M %Z')}</p>",
        "<h3>Summary</h3>",
        "<table border='1' cellpadding='6' cellspacing='0' style='border-collapse:collapse;'>",
        f"<tr><td>Total unique senders</td><td><b>{total_senders}</b></td></tr>",
        f"<tr><td>Active subscriptions</td><td>{active_count}</td></tr>",
        f"<tr><td>Marketing/promotional</td><td>{marketing_count}</td></tr>",
        f"<tr><td>One-time notifications</td><td>{one_time_count}</td></tr>",
        f"<tr><td>Dormant (&gt;{DORMANT_DAYS} days)</td><td style='color:orange;'>{dormant_count}</td></tr>",
        f"<tr><td>Unclassified</td><td>{unknown_count}</td></tr>",
        "</table>",
    ]
    for cls_key, cls_label in display_order:
        items = groups.get(cls_key, [])
        if not items:
            continue
        html_parts.append(f"<h3>{cls_label} ({len(items)})</h3>")
        html_parts.append(
            "<table border='1' cellpadding='4' cellspacing='0' style='border-collapse:collapse; font-size:13px;'>"
        )
        html_parts.append(
            "<tr><th>Sender/Domain</th><th>Count</th><th>Last Seen</th>"
            "<th>Categories</th><th>Accounts</th></tr>"
        )
        for sender, info in items:
            cats = ", ".join(info["categories"])
            accts = ", ".join(info["accounts"])
            last = info["last_seen"][:10] if info["last_seen"] else "N/A"
            html_parts.append(
                f"<tr><td>{sender}</td><td>{info['count']}</td>"
                f"<td>{last}</td><td>{cats}</td><td>{accts}</td></tr>"
            )
        html_parts.append("</table>")
    html_parts.append("<hr>")
    html_parts.append(
        f"<p><small>Generated {now.strftime('%Y-%m-%d %H:%M %Z')} by subscription-auditor.py</small></p>"
    )
    html_parts.append("</body></html>")
    html_body = "\n".join(html_parts)
    return text_body, html_body
# ── main ─────────────────────────────────────────────────────────────────────
def main():
    """Entry point: gather, aggregate, classify, and report subscriptions."""
    arg_parser = argparse.ArgumentParser(description="Monthly Subscription Auditor — analyze email subscriptions")
    arg_parser.add_argument("--dry-run", action="store_true", help="Print report without sending email")
    arg_parser.add_argument("--verbose", action="store_true", help="Enable debug logging")
    opts = arg_parser.parse_args()
    log_level = logging.DEBUG if opts.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )
    # Pull subscription rows from every configured organizer database,
    # tagging each row with the account it came from.
    collected = []
    for acct in ACCOUNTS:
        acct_name = acct["name"]
        log.info("Reading data from %s", acct_name)
        rows = get_subscription_senders(acct["db"])
        for row in rows:
            row["account"] = acct_name
        collected.extend(rows)
        log.info(" Found %d subscription-related entries", len(rows))
    if not collected:
        log.info("No subscription data found across any account.")
        if opts.dry_run:
            print("No subscription data found.")
        return
    # Aggregate, classify, and render.
    by_sender = aggregate_senders(collected)
    log.info("Aggregated %d unique senders/domains", len(by_sender))
    labelled = classify_senders(by_sender)
    text_body, html_body = build_report(labelled)
    if opts.dry_run:
        print(text_body)
        return
    now = datetime.now(tz=ZoneInfo("America/Los_Angeles"))
    subject = f"Monthly Subscription Audit — {now.strftime('%B %Y')}"
    send_email(subject=subject, html_body=html_body, text_body=text_body)
    log.info("Subscription audit report emailed")


if __name__ == "__main__":
    main()