Sanitized mirror from private repository - 2026-04-20 01:32:01 UTC
This commit is contained in:
382
scripts/subscription-auditor.py
Normal file
382
scripts/subscription-auditor.py
Normal file
@@ -0,0 +1,382 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Feature 16 — Subscription Auditor.
|
||||
|
||||
Monthly audit of email subscriptions across all organizer accounts.
|
||||
Classifies senders, identifies dormant subscriptions, and sends a report.
|
||||
|
||||
Cron: 0 9 1 * * cd /home/homelab/organized/repos/homelab && python3 scripts/subscription-auditor.py
|
||||
"""
|
||||
|
||||
import argparse
import html
import logging
import sqlite3
import sys
from collections import defaultdict
from datetime import datetime, timedelta
from pathlib import Path
from zoneinfo import ZoneInfo
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from lib.ollama import ollama_generate, ollama_available, OllamaUnavailableError
|
||||
from lib.notify import send_email
|
||||
|
||||
log = logging.getLogger(__name__)

# Root of the homelab scripts checkout; each organizer's database lives below it.
SCRIPTS_DIR = Path("/home/homelab/organized/repos/homelab/scripts")
# A sender with no activity for this many days is reported as dormant.
DORMANT_DAYS = 90

# Mailboxes to audit: each entry pairs an account name with the SQLite
# database (processed.db) its organizer script maintains.
ACCOUNTS = [
    {
        "name": "lzbellina92@gmail.com",
        "db": SCRIPTS_DIR / "gmail-organizer" / "processed.db",
    },
    {
        # NOTE(review): looks like a sanitization placeholder — confirm the real address.
        "name": "your-email@example.com",
        "db": SCRIPTS_DIR / "gmail-organizer-dvish" / "processed.db",
    },
    {
        "name": "admin@thevish.io",
        "db": SCRIPTS_DIR / "proton-organizer" / "processed.db",
    },
]
|
||||
|
||||
|
||||
# ── data gathering ───────────────────────────────────────────────────────────
|
||||
|
||||
def get_subscription_senders(db_path: Path) -> list[dict]:
    """Collect subscription-related rows from one organizer database.

    Reads the ``processed`` table for messages categorized as 'accounts' or
    'newsletters', plus the optional ``sender_cache`` table for aggregated
    per-sender stats.  Each row becomes a dict tagged with a ``source`` key
    ('processed' or 'sender_cache') so downstream code can tell the two
    shapes apart.

    Args:
        db_path: Path to an organizer's processed.db SQLite file.

    Returns:
        List of row dicts; empty if the database file does not exist.

    Raises:
        sqlite3.OperationalError: if the ``processed`` table is missing
            (propagated; only a missing ``sender_cache`` is tolerated).
    """
    if not db_path.exists():
        log.warning("DB not found: %s", db_path)
        return []

    results: list[dict] = []
    conn = sqlite3.connect(db_path)
    try:
        # Processed entries for the two subscription-style categories.
        rows = conn.execute(
            "SELECT message_id, category, processed_at FROM processed "
            "WHERE category IN ('accounts', 'newsletters') "
            "ORDER BY processed_at DESC"
        ).fetchall()
        for msg_id, category, processed_at in rows:
            results.append({
                "message_id": msg_id,
                "category": category,
                "processed_at": processed_at,
                "source": "processed",
            })

        # sender_cache is optional; older databases may not have the table.
        try:
            cache_rows = conn.execute(
                "SELECT sender, category, hit_count, last_seen FROM sender_cache "
                "WHERE category IN ('accounts', 'newsletters') "
                "ORDER BY hit_count DESC"
            ).fetchall()
            for sender, category, hit_count, last_seen in cache_rows:
                results.append({
                    "sender": sender,
                    "category": category,
                    "hit_count": hit_count,
                    "last_seen": last_seen,
                    "source": "sender_cache",
                })
        except sqlite3.OperationalError:
            log.debug("No sender_cache table in %s", db_path)
    finally:
        # Close even if the 'processed' query raises (e.g. missing table),
        # so one bad database cannot leak a connection per audit run.
        conn.close()
    return results
|
||||
|
||||
|
||||
def aggregate_senders(all_data: list[dict]) -> dict:
    """Merge per-account entries into one stats record per sender/domain.

    Combines both 'sender_cache' and 'processed' rows.  Cache rows carry a
    real sender and a hit count; processed rows only have a Message-ID, so
    the embedded domain is used as a stand-in sender.

    Returns:
        Plain dict mapping sender -> {count, last_seen, categories, accounts}.
    """
    def _blank() -> dict:
        return {"count": 0, "last_seen": "", "categories": set(), "accounts": set()}

    merged = defaultdict(_blank)

    for item in all_data:
        owner = item.get("account", "unknown")

        if item.get("source") == "sender_cache":
            key = item.get("sender", "unknown")
            weight = item.get("hit_count", 1)
            seen = item.get("last_seen", "")
        else:
            # Processed rows have no sender column; approximate with the
            # domain embedded in the Message-ID (<xxx@domain.com>).
            key = _extract_domain_from_message_id(item.get("message_id", ""))
            weight = 1
            seen = item.get("processed_at", "")

        stats = merged[key]
        stats["count"] += weight
        if seen > stats["last_seen"]:
            stats["last_seen"] = seen
        stats["categories"].add(item.get("category", ""))
        stats["accounts"].add(owner)

    return dict(merged)
|
||||
|
||||
|
||||
def _extract_domain_from_message_id(msg_id: str) -> str:
|
||||
"""Extract domain from Message-ID for sender approximation."""
|
||||
# Message-IDs like <abc123@mail.example.com>
|
||||
if "@" in msg_id:
|
||||
domain = msg_id.split("@")[-1].strip(">").strip()
|
||||
return domain
|
||||
return msg_id[:30] if msg_id else "unknown"
|
||||
|
||||
|
||||
# ── classification ───────────────────────────────────────────────────────────
|
||||
|
||||
def classify_senders(senders: dict, dormant_days=None) -> dict:
    """Attach a 'classification' to every sender record.

    Senders whose last activity predates the dormancy cutoff are labelled
    'dormant' directly.  The rest are sent to the local LLM to be labelled
    active_subscription / one_time_notification / marketing; anything the
    LLM skips — or everything, when the LLM is unavailable — falls back to
    'unknown'.

    Args:
        senders: Mapping of sender -> stats dict (from aggregate_senders);
            stats dicts are mutated in place.
        dormant_days: Dormancy threshold in days; defaults to the
            module-level DORMANT_DAYS (90).

    Returns:
        Mapping of sender -> stats dict, every record carrying a
        'classification' key.
    """
    days = DORMANT_DAYS if dormant_days is None else dormant_days
    # ISO-8601 strings compare chronologically, so string comparison below
    # against last_seen is a valid date comparison.
    cutoff = (datetime.now(tz=ZoneInfo("UTC")) - timedelta(days=days)).isoformat()

    dormant = {}
    to_classify = {}
    for sender, info in senders.items():
        if info["last_seen"] and info["last_seen"] < cutoff:
            info["classification"] = "dormant"
            dormant[sender] = info
        else:
            to_classify[sender] = info

    # Availability is checked exactly once: the original double-check could
    # leave senders without any classification if availability flipped
    # between the two calls.
    if to_classify:
        if ollama_available():
            classified = _llm_classify(to_classify)
            for sender, cls in classified.items():
                if sender in to_classify:
                    to_classify[sender]["classification"] = cls
            # The LLM may skip senders; give those an explicit fallback.
            for info in to_classify.values():
                info.setdefault("classification", "unknown")
        else:
            log.warning("Ollama not available, marking all non-dormant as 'unknown'")
            for info in to_classify.values():
                info["classification"] = "unknown"

    all_classified = {}
    all_classified.update(dormant)
    all_classified.update(to_classify)
    return all_classified
|
||||
|
||||
|
||||
def _llm_classify(senders: dict) -> dict:
    """Classify senders via LLM in batches.

    Builds a "SENDER: CLASSIFICATION" prompt per batch of senders and
    parses the model's reply line by line.  Only the three expected labels
    are accepted; senders the model skips or mislabels are simply absent
    from the result, so the caller must apply its own fallback.

    Returns:
        Mapping of sender -> classification for the senders that matched
        (possibly a subset of the input).
    """
    results: dict[str, str] = {}
    sender_list = list(senders.items())

    # Process in batches of 30 to keep each prompt to a size the local
    # model can answer in one response.
    batch_size = 30
    for i in range(0, len(sender_list), batch_size):
        batch = sender_list[i:i + batch_size]
        sender_lines = []
        for sender, info in batch:
            cats = ", ".join(info["categories"])
            sender_lines.append(f"{sender} (count={info['count']}, categories={cats})")

        prompt = (
            "Classify each email sender/domain as: active_subscription, "
            "one_time_notification, or marketing.\n"
            "Format: one line per sender, exactly: SENDER: CLASSIFICATION\n"
            "No extra text.\n\n"
            + "\n".join(sender_lines)
        )

        try:
            response = ollama_generate(prompt, temperature=0.1, num_predict=2000)
            for line in response.strip().split("\n"):
                line = line.strip()
                if ":" not in line:
                    continue
                # Split on the LAST colon so a sender containing ':' still
                # parses into (sender, classification).
                parts = line.rsplit(":", 1)
                if len(parts) != 2:
                    continue
                s = parts[0].strip()
                c = parts[1].strip().lower().replace(" ", "_")
                # Match sender back to our keys with a two-way substring
                # test: the model may echo extra text around the sender.
                for sender, _ in batch:
                    if sender in s or s in sender:
                        if c in ("active_subscription", "one_time_notification", "marketing"):
                            results[sender] = c
                        # NOTE(review): stops at the first fuzzy match even
                        # when the label was rejected — presumably intended
                        # (one response line maps to one sender); confirm
                        # before tightening.
                        break
        except OllamaUnavailableError as e:
            # Model went away mid-run: keep whatever was classified so far
            # and stop submitting further batches.
            log.warning("LLM classification failed: %s", e)
            break

    return results
|
||||
|
||||
|
||||
# ── report ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def build_report(classified: dict, dormant_days=None) -> tuple[str, str]:
    """Build the plain-text and HTML subscription audit report.

    Args:
        classified: Mapping of sender -> stats dict; each stats dict is
            expected to carry 'count', 'last_seen', 'categories' and
            'accounts'.  Records without a 'classification' key are
            reported under "Unclassified".
        dormant_days: Threshold (days) shown in the dormancy labels;
            defaults to the module-level DORMANT_DAYS.

    Returns:
        (text_body, html_body) — the same report rendered both ways.
    """
    days = DORMANT_DAYS if dormant_days is None else dormant_days
    now = datetime.now(tz=ZoneInfo("America/Los_Angeles"))

    # Bucket senders by classification; busiest senders first per bucket.
    groups = defaultdict(list)
    for sender, info in classified.items():
        groups[info.get("classification", "unknown")].append((sender, info))
    for cls in groups:
        groups[cls].sort(key=lambda item: item[1]["count"], reverse=True)

    # Summary counts.
    total_senders = len(classified)
    active_count = len(groups.get("active_subscription", []))
    dormant_count = len(groups.get("dormant", []))
    marketing_count = len(groups.get("marketing", []))
    one_time_count = len(groups.get("one_time_notification", []))
    unknown_count = len(groups.get("unknown", []))

    # Section order shared by both report flavors.
    display_order = [
        ("active_subscription", "Active Subscriptions"),
        ("marketing", "Marketing / Promotional"),
        ("dormant", "Dormant (Potential Unsubscribes)"),
        ("one_time_notification", "One-Time Notifications"),
        ("unknown", "Unclassified"),
    ]

    # ── plain-text report ──
    text_lines = [
        f"Monthly Subscription Audit — {now.strftime('%Y-%m-%d %H:%M %Z')}",
        "=" * 60,
        "",
        f"Total unique senders/domains: {total_senders}",
        f"Active subscriptions: {active_count}",
        f"Marketing/promotional: {marketing_count}",
        f"One-time notifications: {one_time_count}",
        f"Dormant (>{days} days): {dormant_count}",
        f"Unclassified: {unknown_count}",
        "",
    ]

    for cls_key, cls_label in display_order:
        items = groups.get(cls_key, [])
        if not items:
            continue
        text_lines.append(f"--- {cls_label} ({len(items)}) ---")
        for sender, info in items:
            cats = ", ".join(info["categories"])
            accts = ", ".join(info["accounts"])
            text_lines.append(
                f"  {sender:40s} count={info['count']:>4} last={info['last_seen'][:10]} "
                f"cats=[{cats}] accts=[{accts}]"
            )
        text_lines.append("")

    text_body = "\n".join(text_lines)

    # ── HTML report ──
    html_parts = [
        "<html><body>",
        "<h2>Monthly Subscription Audit</h2>",
        f"<p>{now.strftime('%Y-%m-%d %H:%M %Z')}</p>",
        "<h3>Summary</h3>",
        "<table border='1' cellpadding='6' cellspacing='0' style='border-collapse:collapse;'>",
        f"<tr><td>Total unique senders</td><td><b>{total_senders}</b></td></tr>",
        f"<tr><td>Active subscriptions</td><td>{active_count}</td></tr>",
        f"<tr><td>Marketing/promotional</td><td>{marketing_count}</td></tr>",
        f"<tr><td>One-time notifications</td><td>{one_time_count}</td></tr>",
        f"<tr><td>Dormant (>{days} days)</td><td style='color:orange;'>{dormant_count}</td></tr>",
        f"<tr><td>Unclassified</td><td>{unknown_count}</td></tr>",
        "</table>",
    ]

    for cls_key, cls_label in display_order:
        items = groups.get(cls_key, [])
        if not items:
            continue
        html_parts.append(f"<h3>{cls_label} ({len(items)})</h3>")
        html_parts.append(
            "<table border='1' cellpadding='4' cellspacing='0' style='border-collapse:collapse; font-size:13px;'>"
        )
        html_parts.append(
            "<tr><th>Sender/Domain</th><th>Count</th><th>Last Seen</th>"
            "<th>Categories</th><th>Accounts</th></tr>"
        )
        for sender, info in items:
            # Sender domains/categories originate from external email
            # metadata (Message-IDs, headers) — escape them so a crafted
            # Message-ID cannot inject markup into the emailed report.
            cats = html.escape(", ".join(info["categories"]))
            accts = html.escape(", ".join(info["accounts"]))
            last = info["last_seen"][:10] if info["last_seen"] else "N/A"
            html_parts.append(
                f"<tr><td>{html.escape(sender)}</td><td>{info['count']}</td>"
                f"<td>{last}</td><td>{cats}</td><td>{accts}</td></tr>"
            )
        html_parts.append("</table>")

    html_parts.append("<hr>")
    html_parts.append(
        f"<p><small>Generated {now.strftime('%Y-%m-%d %H:%M %Z')} by subscription-auditor.py</small></p>"
    )
    html_parts.append("</body></html>")

    html_body = "\n".join(html_parts)

    return text_body, html_body
|
||||
|
||||
|
||||
# ── main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """CLI entry point: collect, aggregate, classify, then report."""
    parser = argparse.ArgumentParser(description="Monthly Subscription Auditor — analyze email subscriptions")
    parser.add_argument("--dry-run", action="store_true", help="Print report without sending email")
    parser.add_argument("--verbose", action="store_true", help="Enable debug logging")
    args = parser.parse_args()

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )

    # Collect subscription rows from every configured organizer database,
    # tagging each row with the account it came from.
    collected = []
    for account in ACCOUNTS:
        log.info("Reading data from %s", account["name"])
        rows = get_subscription_senders(account["db"])
        for row in rows:
            row["account"] = account["name"]
        collected.extend(rows)
        log.info("  Found %d subscription-related entries", len(rows))

    if not collected:
        log.info("No subscription data found across any account.")
        if args.dry_run:
            print("No subscription data found.")
        return

    # Aggregate per sender/domain, then classify each one.
    senders = aggregate_senders(collected)
    log.info("Aggregated %d unique senders/domains", len(senders))
    classified = classify_senders(senders)

    # Render the report, then either print it (dry run) or email it.
    text_body, html_body = build_report(classified)
    if args.dry_run:
        print(text_body)
        return

    now = datetime.now(tz=ZoneInfo("America/Los_Angeles"))
    subject = f"Monthly Subscription Audit — {now.strftime('%B %Y')}"
    send_email(subject=subject, html_body=html_body, text_body=text_body)
    log.info("Subscription audit report emailed")
|
||||
|
||||
|
||||
# Run the audit when invoked as a script (see the cron line in the module docstring).
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user