Files
homelab-optimized/scripts/gmail-organizer-dvish/gmail_organizer.py
Gitea Mirror Bot 8a947d9e36
Some checks failed
Documentation / Build Docusaurus (push) Failing after 5m3s
Documentation / Deploy to GitHub Pages (push) Has been skipped
Sanitized mirror from private repository - 2026-04-04 03:23:14 UTC
2026-04-04 03:23:14 +00:00

607 lines
22 KiB
Python

#!/usr/bin/env python3
"""Gmail Organizer — classifies emails using a local LLM and applies Gmail labels."""
import argparse
import email
import email.header
import html
import imaplib
import json
import logging
import re
import sqlite3
import sys
import time
import urllib.request
import urllib.error
from datetime import datetime
from pathlib import Path
from zoneinfo import ZoneInfo
import yaml
LOG_FMT = "%(asctime)s %(levelname)-8s %(message)s"
log = logging.getLogger("gmail-organizer")
DB_PATH = Path(__file__).parent / "processed.db"
DEFAULT_CONFIG = Path(__file__).parent / "config.local.yaml"
# ── helpers ──────────────────────────────────────────────────────────────────
def load_config(path: Path) -> dict:
    """Parse the YAML config file at *path* and return it as a dict."""
    with open(path) as handle:
        parsed = yaml.safe_load(handle)
    return parsed
def init_db(db_path: Path) -> sqlite3.Connection:
    """Open (creating if needed) the tracking database and ensure both tables exist.

    `processed` records every classified message; `sender_cache` counts how
    often a sender received the same category so the LLM can be skipped.
    """
    conn = sqlite3.connect(db_path)
    conn.executescript("""
        CREATE TABLE IF NOT EXISTS processed (
            message_id TEXT PRIMARY KEY,
            category TEXT NOT NULL,
            processed_at TEXT NOT NULL
        );
        CREATE TABLE IF NOT EXISTS sender_cache (
            sender TEXT PRIMARY KEY,
            category TEXT NOT NULL,
            hit_count INTEGER DEFAULT 1,
            last_seen TEXT NOT NULL
        );
    """)
    conn.commit()
    return conn
def is_processed(conn: sqlite3.Connection, message_id: str) -> bool:
    """True if *message_id* already has a row in the processed table."""
    cursor = conn.execute(
        "SELECT 1 FROM processed WHERE message_id = ?", (message_id,)
    )
    return cursor.fetchone() is not None
def mark_processed(conn: sqlite3.Connection, message_id: str, category: str):
    """Record (upsert) that *message_id* was classified as *category*, with a UTC timestamp."""
    stamp = datetime.now(tz=ZoneInfo("UTC")).isoformat()
    conn.execute(
        "INSERT OR REPLACE INTO processed (message_id, category, processed_at)"
        " VALUES (?, ?, ?)",
        (message_id, category, stamp),
    )
    conn.commit()
def get_cached_sender(conn: sqlite3.Connection, sender: str, min_hits: int = 3) -> str | None:
"""Return cached category if sender has been classified the same way min_hits times."""
row = conn.execute(
"SELECT category, hit_count FROM sender_cache WHERE sender = ? AND hit_count >= ?",
(sender, min_hits),
).fetchone()
return row[0] if row else None
def update_sender_cache(conn: sqlite3.Connection, sender: str, category: str):
    """Update sender cache. Increment if same category, reset if different."""
    existing = conn.execute(
        "SELECT category FROM sender_cache WHERE sender = ?", (sender,)
    ).fetchone()
    now = datetime.now(tz=ZoneInfo("UTC")).isoformat()
    same_category = existing is not None and existing[0] == category
    if same_category:
        # Same verdict as before: bump the streak counter.
        conn.execute(
            "UPDATE sender_cache SET hit_count = hit_count + 1, last_seen = ? WHERE sender = ?",
            (now, sender),
        )
    else:
        # New sender, or the category changed: restart the streak at 1.
        conn.execute(
            "INSERT OR REPLACE INTO sender_cache (sender, category, hit_count, last_seen) VALUES (?, ?, 1, ?)",
            (sender, category, now),
        )
    conn.commit()
def decode_header(raw: str | None) -> str:
if not raw:
return ""
parts = email.header.decode_header(raw)
decoded = []
for data, charset in parts:
if isinstance(data, bytes):
decoded.append(data.decode(charset or "utf-8", errors="replace"))
else:
decoded.append(data)
return " ".join(decoded)
def extract_email_address(from_header: str) -> str:
    """Extract just the email address from a From header."""
    bracketed = re.search(r'<([^>]+)>', from_header)
    if bracketed:
        return bracketed.group(1).lower()
    # No angle brackets: treat the whole header as the address.
    return from_header.strip().lower()
def extract_text(msg: email.message.Message, max_chars: int) -> str:
"""Extract plain-text body from an email, falling back to stripped HTML."""
body = ""
if msg.is_multipart():
for part in msg.walk():
ct = part.get_content_type()
if ct == "text/plain":
payload = part.get_payload(decode=True)
if payload:
charset = part.get_content_charset() or "utf-8"
body = payload.decode(charset, errors="replace")
break
elif ct == "text/html" and not body:
payload = part.get_payload(decode=True)
if payload:
charset = part.get_content_charset() or "utf-8"
raw_html = payload.decode(charset, errors="replace")
body = html.unescape(re.sub(r"<[^>]+>", " ", raw_html))
else:
payload = msg.get_payload(decode=True)
if payload:
charset = msg.get_content_charset() or "utf-8"
body = payload.decode(charset, errors="replace")
if msg.get_content_type() == "text/html":
body = html.unescape(re.sub(r"<[^>]+>", " ", body))
body = re.sub(r"\s+", " ", body).strip()
return body[:max_chars]
# ── Gmail IMAP ───────────────────────────────────────────────────────────────
class GmailClient:
    """Thin IMAP wrapper for Gmail using the X-GM-LABELS extension.

    Fix: the sanitizer mangled the ``__init__`` signature
    (``app_password: "REDACTED_PASSWORD"``), leaving the file syntactically
    invalid; the parameter is restored as a plain ``str``.
    """

    def __init__(self, email_addr: str, app_password: str):
        """Connect over IMAPS and authenticate with a Gmail app password."""
        self.email = email_addr
        self.conn = imaplib.IMAP4_SSL("imap.gmail.com")
        self.conn.login(email_addr, app_password)

    def fetch_uids(self, mailbox: str = "INBOX", search: str = "ALL",
                   batch_size: int = 50) -> list[bytes]:
        """Return up to *batch_size* message UIDs from *mailbox*, newest first."""
        self.conn.select(mailbox)
        _, data = self.conn.search(None, search)
        uids = data[0].split()
        # SEARCH returns oldest-first; keep the newest slice and flip it.
        return list(reversed(uids[-batch_size:]))

    def fetch_message(self, uid: bytes) -> email.message.Message:
        """Download and parse the full RFC 822 message for *uid*."""
        _, data = self.conn.fetch(uid, "(RFC822)")
        return email.message_from_bytes(data[0][1])

    def get_labels(self, uid: bytes) -> list[str]:
        """Get existing Gmail labels (X-GM-LABELS) for a message."""
        _, data = self.conn.fetch(uid, "(X-GM-LABELS)")
        raw = data[0].decode() if isinstance(data[0], bytes) else str(data[0])
        match = re.search(r'X-GM-LABELS \(([^)]*)\)', raw)
        if match:
            # NOTE(review): split() breaks quoted labels containing spaces —
            # confirm configured labels are single-word before relying on this.
            return match.group(1).split()
        return []

    def apply_label(self, uid: bytes, label: str):
        """Apply a Gmail label to a message. Creates the label if needed."""
        result = self.conn.store(uid, "+X-GM-LABELS", f'("{label}")')
        if result[0] != "OK":
            try:
                self.conn.create(label)
            except imaplib.IMAP4.error:
                pass  # label probably exists already; COPY below still applies it
            self.conn.copy(uid, label)

    def archive(self, uid: bytes):
        """Archive a message (remove from INBOX)."""
        # '\\\\Inbox' yields the IMAP quoted-string escape for \Inbox.
        self.conn.store(uid, "-X-GM-LABELS", '("\\\\Inbox")')

    def star(self, uid: bytes):
        """Star/flag a message for priority."""
        self.conn.store(uid, "+FLAGS", "\\Flagged")

    def close(self):
        """Best-effort CLOSE + LOGOUT; teardown errors are ignored."""
        try:
            self.conn.close()
            self.conn.logout()
        except Exception:
            pass
# ── Ollama LLM ───────────────────────────────────────────────────────────────
def _ollama_request(ollama_url: str, payload: dict, max_retries: int = 3) -> dict:
    """Make an Ollama /api/generate request with retry and exponential backoff.

    Args:
        ollama_url: base URL of the Ollama server (trailing slash tolerated).
        payload: JSON body (model, prompt, options, ...).
        max_retries: total attempts; values < 1 behave like 1.

    Returns the decoded JSON response dict.
    Raises the final URLError/TimeoutError once attempts are exhausted.

    Fix: the original saved the exception and executed ``raise last_error``
    after the loop, which raised ``TypeError: exceptions must derive from
    BaseException`` (``raise None``) when max_retries <= 0. Re-raising in
    place also preserves the original traceback.
    """
    data = json.dumps(payload).encode()
    req = urllib.request.Request(
        f"{ollama_url.rstrip('/')}/api/generate",
        data=data,
        headers={"Content-Type": "application/json"},
    )
    for attempt in range(max(1, max_retries)):
        try:
            with urllib.request.urlopen(req, timeout=60) as resp:
                return json.loads(resp.read())
        except (urllib.error.URLError, TimeoutError) as e:
            if attempt >= max_retries - 1:
                raise  # out of retries: propagate with the original traceback
            wait = 2 ** attempt  # 1s, 2s, 4s, ...
            log.warning("Ollama attempt %d/%d failed: %s — retrying in %ds",
                        attempt + 1, max_retries, e, wait)
            time.sleep(wait)
def classify_email(
    ollama_url: str,
    model: str,
    categories: dict,
    subject: str,
    sender: str,
    body_snippet: str,
) -> tuple[str, bool]:
    """Classify one email via the LLM. Returns (category, is_confident)."""
    cat_descriptions = "\n".join(
        f"- **{name}**: {info['description']}" for name, info in categories.items()
    )
    category_names = ", ".join(categories.keys())
    prompt = f"""Classify this email into exactly ONE category.
Reply with: CATEGORY CONFIDENCE
Where confidence is high, medium, or low.
Categories:
{cat_descriptions}
Email:
From: {sender}
Subject: {subject}
Body: {body_snippet[:1000]}
Reply with one of [{category_names}] followed by [high/medium/low]:"""
    reply = _ollama_request(ollama_url, {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 30},
    })
    answer = reply.get("response", "").strip().lower()
    # Strip any chain-of-thought block some models emit before the label.
    answer = re.sub(r"<think>.*?</think>", "", answer, flags=re.DOTALL).strip()
    confident = "low" not in answer
    matched = next((name for name in categories if name in answer), None)
    if matched is not None:
        return matched, confident
    log.warning("LLM returned unexpected category %r, defaulting to 'personal'", answer)
    return "personal", False
def classify_batch(
    ollama_url: str,
    model: str,
    categories: dict,
    emails: list[dict],
) -> list[tuple[str, bool]]:
    """Classify multiple emails in one LLM call.

    Each entry of *emails* is a dict with "subject", "sender" and "body".
    Returns one (category, is_confident) per email, in input order. Emails
    whose answer line cannot be parsed fall back to classify_email().

    Fix: the previous parser tested ``str(i + 1) in line``, so email 1
    matched any line containing a "1" — including the lines for emails
    10/11/..., "[Email 1]" echoes, and digits inside subjects — silently
    assigning wrong categories. Lines are now matched by their leading
    number only.
    """
    if len(emails) == 1:
        e = emails[0]
        return [classify_email(ollama_url, model, categories,
                               e["subject"], e["sender"], e["body"])]
    cat_descriptions = "\n".join(
        f"- **{name}**: {info['description']}" for name, info in categories.items()
    )
    category_names = ", ".join(categories.keys())
    sections = []
    for i, e in enumerate(emails, 1):
        sections.append(
            f"[Email {i}]\nFrom: {e['sender']}\nSubject: {e['subject']}\nBody: {e['body'][:500]}"
        )
    prompt = f"""Classify each email into ONE category.
Reply with one line per email: NUMBER: CATEGORY CONFIDENCE
Categories:
{cat_descriptions}
Valid: {category_names}
Confidence: high, medium, or low
{chr(10).join(sections)}
Reply:"""
    result = _ollama_request(ollama_url, {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 20 * len(emails)},
    })
    raw = result.get("response", "").strip().lower()
    raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
    # Parse "NUMBER: CATEGORY CONFIDENCE" lines; the number must start the line.
    number_prefix = re.compile(r"^\s*(\d+)\s*[:.)\-]?")
    parsed: dict[int, tuple[str, bool]] = {}
    for line in raw.split("\n"):
        m = number_prefix.match(line)
        if not m:
            continue
        idx = int(m.group(1))
        if idx in parsed:
            continue  # first line per number wins
        for name in categories:
            if name in line:
                parsed[idx] = (name, "low" not in line)
                break
    results = []
    for i, e in enumerate(emails, 1):
        if i in parsed:
            results.append(parsed[i])
        else:
            # Line missing or unparseable — classify this email individually.
            results.append(classify_email(
                ollama_url, model, categories,
                e["subject"], e["sender"], e["body"],
            ))
    return results
# ── notifications ────────────────────────────────────────────────────────────
def send_notification(url: str, message: str, title: str = "Gmail Organizer"):
    """Send a notification via ntfy/webhook URL. No-op when *url* is empty."""
    if not url:
        return
    headers = {"Title": title, "Content-Type": "text/plain"}
    try:
        req = urllib.request.Request(url, data=message.encode(), headers=headers)
        with urllib.request.urlopen(req, timeout=10):
            pass  # response body is irrelevant; delivery is best-effort
    except Exception as e:
        log.warning("Failed to send notification: %s", e)
# ── digest ───────────────────────────────────────────────────────────────────
def generate_digest(stats: dict, details: list[dict]) -> str:
    """Generate a summary of what was processed."""
    # Bookkeeping counters that are not per-category classification counts.
    meta_keys = ("skipped", "errors", "cached", "low_confidence")
    classified = sum(count for key, count in stats.items() if key not in meta_keys)
    out = [
        "Gmail Organizer Summary",
        "=" * 25,
        "",
        f"Classified: {classified}",
        f"Cached (sender match): {stats.get('cached', 0)}",
        f"Skipped (already done): {stats.get('skipped', 0)}",
        f"Low confidence: {stats.get('low_confidence', 0)}",
        f"Errors: {stats.get('errors', 0)}",
    ]
    if details:
        out.append("")
        out.append("Details:")
        for entry in details:
            tag = " [cached]" if entry.get("cached") else ""
            out.append(f" {entry['category']:>12} | {entry['subject'][:50]}{tag}")
    return "\n".join(out)
# ── main ─────────────────────────────────────────────────────────────────────
def _apply_result(
    client, db, uid, message_id, subject, sender_email,
    category, is_confident, categories, stats, details,
    dry_run, use_confidence, priority_cats,
):
    """Apply classification: label, archive, star, record."""
    # Confidence gate: leave the message untouched when the LLM was unsure.
    if use_confidence and not is_confident:
        log.info(" -> %s (low confidence, skipping)", category)
        stats["low_confidence"] += 1
        return
    cat_cfg = categories[category]
    label = cat_cfg["label"]
    wants_archive = cat_cfg.get("archive", False)
    wants_star = category in priority_cats
    log.info(" -> %s (%s)", category, label)
    if dry_run:
        planned = [name for name, flag in (("archive", wants_archive),
                                           ("star", wants_star)) if flag]
        suffix = (" + " + ", ".join(planned)) if planned else ""
        log.info(" [DRY RUN] %s%s", label, suffix)
    else:
        client.apply_label(uid, label)
        if wants_archive:
            client.archive(uid)
            log.info(" archived")
        if wants_star:
            client.star(uid)
            log.info(" starred")
        # Persist only when we actually acted on the message.
        mark_processed(db, message_id, category)
    stats[category] = stats.get(category, 0) + 1
    details.append({"subject": subject, "category": category,
                    "sender": sender_email, "cached": False})
def run(config_path: Path, dry_run: bool = False, reprocess: bool = False,
        limit: int | None = None, digest: bool = False):
    """Main loop: fetch messages, classify each (sender cache, batch LLM, or
    single LLM call), apply labels/stars/archiving, and optionally notify.

    Args:
        config_path: YAML config with gmail/ollama/categories sections.
        dry_run: log intended actions without modifying Gmail or recording
            messages as processed.
        reprocess: re-classify messages already recorded in the database.
        limit: override the configured fetch batch size.
        digest: force the end-of-run summary even if config doesn't ask for it.
    """
    cfg = load_config(config_path)
    gmail_cfg = cfg["gmail"]
    ollama_cfg = cfg["ollama"]
    categories = cfg["categories"]
    proc_cfg = cfg.get("processing", {})
    notify_cfg = cfg.get("notifications", {})
    # CLI flags win over config values where both are given.
    batch_size = limit or proc_cfg.get("batch_size", 50)
    max_body = proc_cfg.get("max_body_chars", 2000)
    dry_run = dry_run or proc_cfg.get("dry_run", False)
    mailbox = proc_cfg.get("mailbox", "INBOX")
    use_confidence = proc_cfg.get("confidence_threshold", True)
    sender_min_hits = proc_cfg.get("sender_cache_min_hits", 3)
    use_batch = proc_cfg.get("batch_classify", False)
    batch_sz = proc_cfg.get("batch_classify_size", 5)
    priority_cats = proc_cfg.get("priority_star", [])
    log.info("Connecting to Gmail as %s", gmail_cfg["email"])
    client = GmailClient(gmail_cfg["email"], gmail_cfg["app_password"])
    db = init_db(DB_PATH)
    try:
        uids = client.fetch_uids(mailbox=mailbox, batch_size=batch_size)
        log.info("Fetched %d message UIDs", len(uids))
        # Per-category counters plus bookkeeping keys used by the digest.
        stats = {cat: 0 for cat in categories}
        stats.update({"skipped": 0, "errors": 0, "cached": 0, "low_confidence": 0})
        details = []
        consecutive_errors = 0
        pending = []  # emails accumulated for a batched LLM call
        for i, uid in enumerate(uids, 1):
            try:
                msg = client.fetch_message(uid)
                # Fall back to a UID-derived key when Message-ID is missing.
                message_id = msg.get("Message-ID", f"uid-{uid.decode()}")
                subject = decode_header(msg.get("Subject"))
                sender = decode_header(msg.get("From"))
                sender_email = extract_email_address(sender)
                if not reprocess and is_processed(db, message_id):
                    stats["skipped"] += 1
                    continue
                body = extract_text(msg, max_body)
                # Sender cache — skip LLM if sender always gets same category
                cached = get_cached_sender(db, sender_email, sender_min_hits)
                if cached and not reprocess:
                    log.info("[%d/%d] Cached: %s -> %s (from: %s)",
                             i, len(uids), subject[:50], cached, sender_email)
                    # is_confident=True / use_confidence=False: cache hits are
                    # applied unconditionally.
                    _apply_result(
                        client, db, uid, message_id, subject, sender_email,
                        cached, True, categories, stats, details,
                        dry_run, False, priority_cats,
                    )
                    details[-1]["cached"] = True
                    stats["cached"] += 1
                    update_sender_cache(db, sender_email, cached)
                    consecutive_errors = 0
                    continue
                # Batch mode: accumulate then classify together
                if use_batch:
                    pending.append({
                        "uid": uid, "message_id": message_id,
                        "subject": subject, "sender": sender,
                        "sender_email": sender_email, "body": body,
                    })
                    # Keep accumulating until the batch is full or input ends.
                    if len(pending) < batch_sz and i < len(uids):
                        continue
                    log.info("Classifying batch of %d...", len(pending))
                    results = classify_batch(
                        ollama_cfg["url"], ollama_cfg["model"], categories, pending,
                    )
                    for item, (cat, conf) in zip(pending, results):
                        _apply_result(
                            client, db, item["uid"], item["message_id"],
                            item["subject"], item["sender_email"],
                            cat, conf, categories, stats, details,
                            dry_run, use_confidence, priority_cats,
                        )
                        update_sender_cache(db, item["sender_email"], cat)
                    pending = []
                    consecutive_errors = 0
                    continue
                # Single classification
                log.info("[%d/%d] Classifying: %s (from: %s)",
                         i, len(uids), subject[:60], sender[:40])
                category, is_confident = classify_email(
                    ollama_cfg["url"], ollama_cfg["model"],
                    categories, subject, sender, body,
                )
                _apply_result(
                    client, db, uid, message_id, subject, sender_email,
                    category, is_confident, categories, stats, details,
                    dry_run, use_confidence, priority_cats,
                )
                update_sender_cache(db, sender_email, category)
                consecutive_errors = 0
            except Exception as e:
                log.error("Error processing UID %s: %s", uid, e)
                stats["errors"] += 1
                consecutive_errors += 1
                # Circuit breaker: repeated failures usually mean IMAP or the
                # LLM endpoint is down — alert and stop instead of spinning.
                if consecutive_errors >= 5:
                    log.error("5 consecutive errors, aborting")
                    send_notification(
                        notify_cfg.get("url"),
                        f"Gmail Organizer: 5 consecutive errors. Last: {e}",
                        title="Gmail Organizer ALERT",
                    )
                    break
        # Flush remaining batch
        if pending:
            try:
                results = classify_batch(
                    ollama_cfg["url"], ollama_cfg["model"], categories, pending,
                )
                for item, (cat, conf) in zip(pending, results):
                    _apply_result(
                        client, db, item["uid"], item["message_id"],
                        item["subject"], item["sender_email"],
                        cat, conf, categories, stats, details,
                        dry_run, use_confidence, priority_cats,
                    )
                    update_sender_cache(db, item["sender_email"], cat)
            except Exception as e:
                # Best-effort flush: count the whole batch as errored.
                log.error("Batch failed: %s", e)
                stats["errors"] += len(pending)
        log.info("Done! Stats: %s", json.dumps(stats, indent=2))
        if digest or notify_cfg.get("digest"):
            digest_text = generate_digest(stats, details)
            log.info("\n%s", digest_text)
            if notify_cfg.get("url"):
                send_notification(notify_cfg["url"], digest_text)
        if stats["errors"] > 0 and notify_cfg.get("url"):
            send_notification(
                notify_cfg["url"],
                f"Completed with {stats['errors']} errors",
                title="Gmail Organizer Warning",
            )
    finally:
        # Always release the IMAP session and the SQLite handle.
        client.close()
        db.close()
def main():
    """CLI entry point: parse arguments, configure logging, and dispatch to run()."""
    parser = argparse.ArgumentParser(
        description="Gmail Organizer — LLM-powered email classification")
    options = [
        (("-c", "--config"), dict(type=Path, default=DEFAULT_CONFIG,
                                  help="Path to config YAML (default: config.local.yaml)")),
        (("-n", "--dry-run"), dict(action="store_true",
                                   help="Classify but don't apply labels")),
        (("--reprocess",), dict(action="store_true",
                                help="Re-classify already-processed emails")),
        (("--limit",), dict(type=int, default=None,
                            help="Override batch size")),
        (("--digest",), dict(action="store_true",
                             help="Print classification digest summary")),
        (("-v", "--verbose"), dict(action="store_true",
                                   help="Debug logging")),
    ]
    for flags, kwargs in options:
        parser.add_argument(*flags, **kwargs)
    args = parser.parse_args()
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=level, format=LOG_FMT)
    if not args.config.exists():
        log.error("Config not found: %s", args.config)
        log.error("Copy config.yaml to config.local.yaml and fill in your credentials.")
        sys.exit(1)
    run(args.config, dry_run=args.dry_run, reprocess=args.reprocess,
        limit=args.limit, digest=args.digest)


if __name__ == "__main__":
    main()