#!/usr/bin/env python3
|
|
"""Feature 16 — Subscription Auditor.
|
|
|
|
Monthly audit of email subscriptions across all organizer accounts.
|
|
Classifies senders, identifies dormant subscriptions, and sends a report.
|
|
|
|
Cron: 0 9 1 * * cd /home/homelab/organized/repos/homelab && python3 scripts/subscription-auditor.py
|
|
"""
|
|
|
|
import argparse
import html
import logging
import sqlite3
import sys
from collections import defaultdict
from datetime import datetime, timedelta
from pathlib import Path
from zoneinfo import ZoneInfo

sys.path.insert(0, str(Path(__file__).parent))
from lib.ollama import ollama_generate, ollama_available, OllamaUnavailableError
from lib.notify import send_email
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
SCRIPTS_DIR = Path("/home/homelab/organized/repos/homelab/scripts")
|
|
DORMANT_DAYS = 90
|
|
|
|
ACCOUNTS = [
|
|
{
|
|
"name": "lzbellina92@gmail.com",
|
|
"db": SCRIPTS_DIR / "gmail-organizer" / "processed.db",
|
|
},
|
|
{
|
|
"name": "your-email@example.com",
|
|
"db": SCRIPTS_DIR / "gmail-organizer-dvish" / "processed.db",
|
|
},
|
|
{
|
|
"name": "admin@thevish.io",
|
|
"db": SCRIPTS_DIR / "proton-organizer" / "processed.db",
|
|
},
|
|
]
|
|
|
|
|
|
# ── data gathering ───────────────────────────────────────────────────────────
|
|
|
|
def get_subscription_senders(db_path: Path) -> list[dict]:
|
|
"""Query processed.db for accounts and newsletters categories, plus sender_cache."""
|
|
if not db_path.exists():
|
|
log.warning("DB not found: %s", db_path)
|
|
return []
|
|
|
|
conn = sqlite3.connect(db_path)
|
|
results = []
|
|
|
|
# Get processed entries for accounts and newsletters
|
|
rows = conn.execute(
|
|
"SELECT message_id, category, processed_at FROM processed "
|
|
"WHERE category IN ('accounts', 'newsletters') "
|
|
"ORDER BY processed_at DESC"
|
|
).fetchall()
|
|
|
|
for msg_id, category, processed_at in rows:
|
|
results.append({
|
|
"message_id": msg_id,
|
|
"category": category,
|
|
"processed_at": processed_at,
|
|
"source": "processed",
|
|
})
|
|
|
|
# Also pull from sender_cache if it exists
|
|
try:
|
|
cache_rows = conn.execute(
|
|
"SELECT sender, category, hit_count, last_seen FROM sender_cache "
|
|
"WHERE category IN ('accounts', 'newsletters') "
|
|
"ORDER BY hit_count DESC"
|
|
).fetchall()
|
|
for sender, category, hit_count, last_seen in cache_rows:
|
|
results.append({
|
|
"sender": sender,
|
|
"category": category,
|
|
"hit_count": hit_count,
|
|
"last_seen": last_seen,
|
|
"source": "sender_cache",
|
|
})
|
|
except sqlite3.OperationalError:
|
|
log.debug("No sender_cache table in %s", db_path)
|
|
|
|
conn.close()
|
|
return results
|
|
|
|
|
|
def aggregate_senders(all_data: list[dict]) -> dict:
|
|
"""Aggregate sender data. Returns dict of sender -> stats."""
|
|
senders = defaultdict(lambda: {
|
|
"count": 0,
|
|
"last_seen": "",
|
|
"categories": set(),
|
|
"accounts": set(),
|
|
})
|
|
|
|
for entry in all_data:
|
|
account_name = entry.get("account", "unknown")
|
|
|
|
if entry.get("source") == "sender_cache":
|
|
sender = entry.get("sender", "unknown")
|
|
info = senders[sender]
|
|
info["count"] += entry.get("hit_count", 1)
|
|
if entry.get("last_seen", "") > info["last_seen"]:
|
|
info["last_seen"] = entry["last_seen"]
|
|
info["categories"].add(entry.get("category", ""))
|
|
info["accounts"].add(account_name)
|
|
else:
|
|
# For processed entries, extract sender domain from message_id as proxy
|
|
msg_id = entry.get("message_id", "")
|
|
# Message-IDs often contain sender domain: <xxx@domain.com>
|
|
sender = _extract_domain_from_message_id(msg_id)
|
|
info = senders[sender]
|
|
info["count"] += 1
|
|
if entry.get("processed_at", "") > info["last_seen"]:
|
|
info["last_seen"] = entry["processed_at"]
|
|
info["categories"].add(entry.get("category", ""))
|
|
info["accounts"].add(account_name)
|
|
|
|
return dict(senders)
|
|
|
|
|
|
def _extract_domain_from_message_id(msg_id: str) -> str:
|
|
"""Extract domain from Message-ID for sender approximation."""
|
|
# Message-IDs like <abc123@mail.example.com>
|
|
if "@" in msg_id:
|
|
domain = msg_id.split("@")[-1].strip(">").strip()
|
|
return domain
|
|
return msg_id[:30] if msg_id else "unknown"
|
|
|
|
|
|
# ── classification ───────────────────────────────────────────────────────────
|
|
|
|
def classify_senders(senders: dict) -> dict:
|
|
"""Classify senders as dormant, active_subscription, one_time_notification, or marketing."""
|
|
cutoff = (datetime.now(tz=ZoneInfo("UTC")) - timedelta(days=DORMANT_DAYS)).isoformat()
|
|
|
|
dormant = {}
|
|
to_classify = {}
|
|
|
|
for sender, info in senders.items():
|
|
if info["last_seen"] and info["last_seen"] < cutoff:
|
|
info["classification"] = "dormant"
|
|
dormant[sender] = info
|
|
else:
|
|
to_classify[sender] = info
|
|
|
|
# Use LLM to classify non-dormant senders
|
|
if to_classify and ollama_available():
|
|
classified = _llm_classify(to_classify)
|
|
for sender, cls in classified.items():
|
|
if sender in to_classify:
|
|
to_classify[sender]["classification"] = cls
|
|
# Any unclassified get "unknown"
|
|
for sender, info in to_classify.items():
|
|
if "classification" not in info:
|
|
info["classification"] = "unknown"
|
|
else:
|
|
if to_classify and not ollama_available():
|
|
log.warning("Ollama not available, marking all non-dormant as 'unknown'")
|
|
for sender, info in to_classify.items():
|
|
info["classification"] = "unknown"
|
|
|
|
all_classified = {}
|
|
all_classified.update(dormant)
|
|
all_classified.update(to_classify)
|
|
return all_classified
|
|
|
|
|
|
def _llm_classify(senders: dict) -> dict:
|
|
"""Classify senders via LLM in batches."""
|
|
results = {}
|
|
sender_list = list(senders.items())
|
|
|
|
# Process in batches of 30
|
|
batch_size = 30
|
|
for i in range(0, len(sender_list), batch_size):
|
|
batch = sender_list[i:i + batch_size]
|
|
sender_lines = []
|
|
for sender, info in batch:
|
|
cats = ", ".join(info["categories"])
|
|
sender_lines.append(f"{sender} (count={info['count']}, categories={cats})")
|
|
|
|
prompt = (
|
|
"Classify each email sender/domain as: active_subscription, "
|
|
"one_time_notification, or marketing.\n"
|
|
"Format: one line per sender, exactly: SENDER: CLASSIFICATION\n"
|
|
"No extra text.\n\n"
|
|
+ "\n".join(sender_lines)
|
|
)
|
|
|
|
try:
|
|
response = ollama_generate(prompt, temperature=0.1, num_predict=2000)
|
|
for line in response.strip().split("\n"):
|
|
line = line.strip()
|
|
if ":" not in line:
|
|
continue
|
|
parts = line.rsplit(":", 1)
|
|
if len(parts) != 2:
|
|
continue
|
|
s = parts[0].strip()
|
|
c = parts[1].strip().lower().replace(" ", "_")
|
|
# Match sender back to our keys
|
|
for sender, _ in batch:
|
|
if sender in s or s in sender:
|
|
if c in ("active_subscription", "one_time_notification", "marketing"):
|
|
results[sender] = c
|
|
break
|
|
except OllamaUnavailableError as e:
|
|
log.warning("LLM classification failed: %s", e)
|
|
break
|
|
|
|
return results
|
|
|
|
|
|
# ── report ───────────────────────────────────────────────────────────────────
|
|
|
|
def build_report(classified: dict) -> tuple[str, str]:
|
|
"""Build text and HTML subscription audit report."""
|
|
now = datetime.now(tz=ZoneInfo("America/Los_Angeles"))
|
|
|
|
# Group by classification
|
|
groups = defaultdict(list)
|
|
for sender, info in classified.items():
|
|
groups[info.get("classification", "unknown")].append((sender, info))
|
|
|
|
# Sort each group by count descending
|
|
for cls in groups:
|
|
groups[cls].sort(key=lambda x: -x[1]["count"])
|
|
|
|
# Stats
|
|
total_senders = len(classified)
|
|
active_count = len(groups.get("active_subscription", []))
|
|
dormant_count = len(groups.get("dormant", []))
|
|
marketing_count = len(groups.get("marketing", []))
|
|
one_time_count = len(groups.get("one_time_notification", []))
|
|
unknown_count = len(groups.get("unknown", []))
|
|
|
|
# Text report
|
|
text_lines = [
|
|
f"Monthly Subscription Audit — {now.strftime('%Y-%m-%d %H:%M %Z')}",
|
|
"=" * 60,
|
|
"",
|
|
f"Total unique senders/domains: {total_senders}",
|
|
f"Active subscriptions: {active_count}",
|
|
f"Marketing/promotional: {marketing_count}",
|
|
f"One-time notifications: {one_time_count}",
|
|
f"Dormant (>{DORMANT_DAYS} days): {dormant_count}",
|
|
f"Unclassified: {unknown_count}",
|
|
"",
|
|
]
|
|
|
|
# Classification order for display
|
|
display_order = [
|
|
("active_subscription", "Active Subscriptions"),
|
|
("marketing", "Marketing / Promotional"),
|
|
("dormant", "Dormant (Potential Unsubscribes)"),
|
|
("one_time_notification", "One-Time Notifications"),
|
|
("unknown", "Unclassified"),
|
|
]
|
|
|
|
for cls_key, cls_label in display_order:
|
|
items = groups.get(cls_key, [])
|
|
if not items:
|
|
continue
|
|
text_lines.append(f"--- {cls_label} ({len(items)}) ---")
|
|
for sender, info in items:
|
|
cats = ", ".join(info["categories"])
|
|
accts = ", ".join(info["accounts"])
|
|
text_lines.append(
|
|
f" {sender:40s} count={info['count']:>4} last={info['last_seen'][:10]} "
|
|
f"cats=[{cats}] accts=[{accts}]"
|
|
)
|
|
text_lines.append("")
|
|
|
|
text_body = "\n".join(text_lines)
|
|
|
|
# HTML report
|
|
html_parts = [
|
|
"<html><body>",
|
|
"<h2>Monthly Subscription Audit</h2>",
|
|
f"<p>{now.strftime('%Y-%m-%d %H:%M %Z')}</p>",
|
|
"<h3>Summary</h3>",
|
|
"<table border='1' cellpadding='6' cellspacing='0' style='border-collapse:collapse;'>",
|
|
f"<tr><td>Total unique senders</td><td><b>{total_senders}</b></td></tr>",
|
|
f"<tr><td>Active subscriptions</td><td>{active_count}</td></tr>",
|
|
f"<tr><td>Marketing/promotional</td><td>{marketing_count}</td></tr>",
|
|
f"<tr><td>One-time notifications</td><td>{one_time_count}</td></tr>",
|
|
f"<tr><td>Dormant (>{DORMANT_DAYS} days)</td><td style='color:orange;'>{dormant_count}</td></tr>",
|
|
f"<tr><td>Unclassified</td><td>{unknown_count}</td></tr>",
|
|
"</table>",
|
|
]
|
|
|
|
for cls_key, cls_label in display_order:
|
|
items = groups.get(cls_key, [])
|
|
if not items:
|
|
continue
|
|
html_parts.append(f"<h3>{cls_label} ({len(items)})</h3>")
|
|
html_parts.append(
|
|
"<table border='1' cellpadding='4' cellspacing='0' style='border-collapse:collapse; font-size:13px;'>"
|
|
)
|
|
html_parts.append(
|
|
"<tr><th>Sender/Domain</th><th>Count</th><th>Last Seen</th>"
|
|
"<th>Categories</th><th>Accounts</th></tr>"
|
|
)
|
|
for sender, info in items:
|
|
cats = ", ".join(info["categories"])
|
|
accts = ", ".join(info["accounts"])
|
|
last = info["last_seen"][:10] if info["last_seen"] else "N/A"
|
|
html_parts.append(
|
|
f"<tr><td>{sender}</td><td>{info['count']}</td>"
|
|
f"<td>{last}</td><td>{cats}</td><td>{accts}</td></tr>"
|
|
)
|
|
html_parts.append("</table>")
|
|
|
|
html_parts.append("<hr>")
|
|
html_parts.append(
|
|
f"<p><small>Generated {now.strftime('%Y-%m-%d %H:%M %Z')} by subscription-auditor.py</small></p>"
|
|
)
|
|
html_parts.append("</body></html>")
|
|
|
|
html_body = "\n".join(html_parts)
|
|
|
|
return text_body, html_body
|
|
|
|
|
|
# ── main ─────────────────────────────────────────────────────────────────────
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description="Monthly Subscription Auditor — analyze email subscriptions")
|
|
parser.add_argument("--dry-run", action="store_true", help="Print report without sending email")
|
|
parser.add_argument("--verbose", action="store_true", help="Enable debug logging")
|
|
args = parser.parse_args()
|
|
|
|
logging.basicConfig(
|
|
level=logging.DEBUG if args.verbose else logging.INFO,
|
|
format="%(asctime)s %(levelname)s %(name)s: %(message)s",
|
|
)
|
|
|
|
# Gather data from all accounts
|
|
all_data = []
|
|
for account in ACCOUNTS:
|
|
account_name = account["name"]
|
|
log.info("Reading data from %s", account_name)
|
|
entries = get_subscription_senders(account["db"])
|
|
for entry in entries:
|
|
entry["account"] = account_name
|
|
all_data.extend(entries)
|
|
log.info(" Found %d subscription-related entries", len(entries))
|
|
|
|
if not all_data:
|
|
log.info("No subscription data found across any account.")
|
|
if args.dry_run:
|
|
print("No subscription data found.")
|
|
return
|
|
|
|
# Aggregate by sender
|
|
senders = aggregate_senders(all_data)
|
|
log.info("Aggregated %d unique senders/domains", len(senders))
|
|
|
|
# Classify
|
|
classified = classify_senders(senders)
|
|
|
|
# Build report
|
|
text_body, html_body = build_report(classified)
|
|
|
|
if args.dry_run:
|
|
print(text_body)
|
|
return
|
|
|
|
now = datetime.now(tz=ZoneInfo("America/Los_Angeles"))
|
|
subject = f"Monthly Subscription Audit — {now.strftime('%B %Y')}"
|
|
send_email(subject=subject, html_body=html_body, text_body=text_body)
|
|
log.info("Subscription audit report emailed")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|