#!/usr/bin/env python3 """Gmail Organizer — classifies emails using a local LLM and applies Gmail labels.""" import argparse import email import email.header import html import imaplib import json import logging import re import sqlite3 import sys import time import urllib.request import urllib.error from datetime import datetime from pathlib import Path from zoneinfo import ZoneInfo import yaml LOG_FMT = "%(asctime)s %(levelname)-8s %(message)s" log = logging.getLogger("gmail-organizer") DB_PATH = Path(__file__).parent / "processed.db" DEFAULT_CONFIG = Path(__file__).parent / "config.local.yaml" # ── helpers ────────────────────────────────────────────────────────────────── def load_config(path: Path) -> dict: with open(path) as f: return yaml.safe_load(f) def init_db(db_path: Path) -> sqlite3.Connection: conn = sqlite3.connect(db_path) conn.execute(""" CREATE TABLE IF NOT EXISTS processed ( message_id TEXT PRIMARY KEY, category TEXT NOT NULL, processed_at TEXT NOT NULL ) """) conn.execute(""" CREATE TABLE IF NOT EXISTS sender_cache ( sender TEXT PRIMARY KEY, category TEXT NOT NULL, hit_count INTEGER DEFAULT 1, last_seen TEXT NOT NULL ) """) conn.commit() return conn def is_processed(conn: sqlite3.Connection, message_id: str) -> bool: row = conn.execute( "SELECT 1 FROM processed WHERE message_id = ?", (message_id,) ).fetchone() return row is not None def mark_processed(conn: sqlite3.Connection, message_id: str, category: str): conn.execute( "INSERT OR REPLACE INTO processed (message_id, category, processed_at) VALUES (?, ?, ?)", (message_id, category, datetime.now(tz=ZoneInfo("UTC")).isoformat()), ) conn.commit() def get_cached_sender(conn: sqlite3.Connection, sender: str, min_hits: int = 3) -> str | None: """Return cached category if sender has been classified the same way min_hits times.""" row = conn.execute( "SELECT category, hit_count FROM sender_cache WHERE sender = ? AND hit_count >= ?", (sender, min_hits), ).fetchone() return row[0] if row else None def update_sender_cache(conn: sqlite3.Connection, sender: str, category: str): """Update sender cache. Increment if same category, reset if different.""" row = conn.execute( "SELECT category FROM sender_cache WHERE sender = ?", (sender,) ).fetchone() now = datetime.now(tz=ZoneInfo("UTC")).isoformat() if row and row[0] == category: conn.execute( "UPDATE sender_cache SET hit_count = hit_count + 1, last_seen = ? WHERE sender = ?", (now, sender), ) else: conn.execute( "INSERT OR REPLACE INTO sender_cache (sender, category, hit_count, last_seen) VALUES (?, ?, 1, ?)", (sender, category, now), ) conn.commit() def decode_header(raw: str | None) -> str: if not raw: return "" parts = email.header.decode_header(raw) decoded = [] for data, charset in parts: if isinstance(data, bytes): decoded.append(data.decode(charset or "utf-8", errors="replace")) else: decoded.append(data) return " ".join(decoded) def extract_email_address(from_header: str) -> str: """Extract just the email address from a From header.""" match = re.search(r'<([^>]+)>', from_header) return match.group(1).lower() if match else from_header.strip().lower() def extract_text(msg: email.message.Message, max_chars: int) -> str: """Extract plain-text body from an email, falling back to stripped HTML.""" body = "" if msg.is_multipart(): for part in msg.walk(): ct = part.get_content_type() if ct == "text/plain": payload = part.get_payload(decode=True) if payload: charset = part.get_content_charset() or "utf-8" body = payload.decode(charset, errors="replace") break elif ct == "text/html" and not body: payload = part.get_payload(decode=True) if payload: charset = part.get_content_charset() or "utf-8" raw_html = payload.decode(charset, errors="replace") body = html.unescape(re.sub(r"<[^>]+>", " ", raw_html)) else: payload = msg.get_payload(decode=True) if payload: charset = msg.get_content_charset() or "utf-8" body = payload.decode(charset, errors="replace") if msg.get_content_type() == "text/html": body = html.unescape(re.sub(r"<[^>]+>", " ", body)) body = re.sub(r"\s+", " ", body).strip() return body[:max_chars] # ── Gmail IMAP ─────────────────────────────────────────────────────────────── class GmailClient: def __init__(self, email_addr: str, app_password: "REDACTED_PASSWORD" self.email = email_addr self.conn = imaplib.IMAP4_SSL("imap.gmail.com") self.conn.login(email_addr, app_password) def fetch_uids(self, mailbox: str = "INBOX", search: str = "ALL", batch_size: int = 50) -> list[bytes]: self.conn.select(mailbox) _, data = self.conn.search(None, search) uids = data[0].split() return list(reversed(uids[-batch_size:])) def fetch_message(self, uid: bytes) -> email.message.Message: _, data = self.conn.fetch(uid, "(RFC822)") return email.message_from_bytes(data[0][1]) def get_labels(self, uid: bytes) -> list[str]: """Get existing Gmail labels (X-GM-LABELS) for a message.""" _, data = self.conn.fetch(uid, "(X-GM-LABELS)") raw = data[0].decode() if isinstance(data[0], bytes) else str(data[0]) match = re.search(r'X-GM-LABELS \(([^)]*)\)', raw) if match: return match.group(1).split() return [] def apply_label(self, uid: bytes, label: str): """Apply a Gmail label to a message. Creates the label if needed.""" result = self.conn.store(uid, "+X-GM-LABELS", f'("{label}")') if result[0] != "OK": try: self.conn.create(label) except imaplib.IMAP4.error: pass self.conn.copy(uid, label) def archive(self, uid: bytes): """Archive a message (remove from INBOX).""" self.conn.store(uid, "-X-GM-LABELS", '("\\\\Inbox")') def star(self, uid: bytes): """Star/flag a message for priority.""" self.conn.store(uid, "+FLAGS", "\\Flagged") def close(self): try: self.conn.close() self.conn.logout() except Exception: pass # ── Ollama LLM ─────────────────────────────────────────────────────────────── def _ollama_request(ollama_url: str, payload: dict, max_retries: int = 3) -> dict: """Make an Ollama API request with retry and exponential backoff.""" data = json.dumps(payload).encode() req = urllib.request.Request( f"{ollama_url.rstrip('/')}/api/generate", data=data, headers={"Content-Type": "application/json"}, ) last_error = None for attempt in range(max_retries): try: with urllib.request.urlopen(req, timeout=60) as resp: return json.loads(resp.read()) except (urllib.error.URLError, TimeoutError) as e: last_error = e if attempt < max_retries - 1: wait = 2 ** attempt log.warning("Ollama attempt %d/%d failed: %s — retrying in %ds", attempt + 1, max_retries, e, wait) time.sleep(wait) raise last_error def classify_email( ollama_url: str, model: str, categories: dict, subject: str, sender: str, body_snippet: str, ) -> tuple[str, bool]: """Classify an email. Returns (category, is_confident).""" cat_descriptions = "\n".join( f"- **{name}**: {info['description']}" for name, info in categories.items() ) category_names = ", ".join(categories.keys()) prompt = f"""Classify this email into exactly ONE category. Reply with: CATEGORY CONFIDENCE Where confidence is high, medium, or low. Categories: {cat_descriptions} Email: From: {sender} Subject: {subject} Body: {body_snippet[:1000]} Reply with one of [{category_names}] followed by [high/medium/low]:""" result = _ollama_request(ollama_url, { "model": model, "prompt": prompt, "stream": False, "options": {"temperature": 0.1, "num_predict": 30}, }) raw = result.get("response", "").strip().lower() raw = re.sub(r".*?", "", raw, flags=re.DOTALL).strip() is_confident = "low" not in raw for name in categories: if name in raw: return name, is_confident log.warning("LLM returned unexpected category %r, defaulting to 'personal'", raw) return "personal", False def classify_batch( ollama_url: str, model: str, categories: dict, emails: list[dict], ) -> list[tuple[str, bool]]: """Classify multiple emails in one LLM call.""" if len(emails) == 1: e = emails[0] return [classify_email(ollama_url, model, categories, e["subject"], e["sender"], e["body"])] cat_descriptions = "\n".join( f"- **{name}**: {info['description']}" for name, info in categories.items() ) category_names = ", ".join(categories.keys()) sections = [] for i, e in enumerate(emails, 1): sections.append( f"[Email {i}]\nFrom: {e['sender']}\nSubject: {e['subject']}\nBody: {e['body'][:500]}" ) prompt = f"""Classify each email into ONE category. Reply with one line per email: NUMBER: CATEGORY CONFIDENCE Categories: {cat_descriptions} Valid: {category_names} Confidence: high, medium, or low {chr(10).join(sections)} Reply:""" result = _ollama_request(ollama_url, { "model": model, "prompt": prompt, "stream": False, "options": {"temperature": 0.1, "num_predict": 20 * len(emails)}, }) raw = result.get("response", "").strip().lower() raw = re.sub(r".*?", "", raw, flags=re.DOTALL).strip() results = [] lines = raw.split("\n") for i, e in enumerate(emails): found = False for line in lines: if str(i + 1) in line: confident = "low" not in line for name in categories: if name in line: results.append((name, confident)) found = True break if found: break if not found: results.append(classify_email( ollama_url, model, categories, e["subject"], e["sender"], e["body"], )) return results # ── notifications ──────────────────────────────────────────────────────────── def send_notification(url: str, message: str, title: str = "Gmail Organizer"): """Send a notification via ntfy/webhook URL.""" if not url: return try: req = urllib.request.Request(url, data=message.encode(), headers={ "Title": title, "Content-Type": "text/plain", }) with urllib.request.urlopen(req, timeout=10): pass except Exception as e: log.warning("Failed to send notification: %s", e) # ── digest ─────────────────────────────────────────────────────────────────── def generate_digest(stats: dict, details: list[dict]) -> str: """Generate a summary of what was processed.""" lines = ["Gmail Organizer Summary", "=" * 25, ""] classified = sum(v for k, v in stats.items() if k not in ("skipped", "errors", "cached", "low_confidence")) lines.append(f"Classified: {classified}") lines.append(f"Cached (sender match): {stats.get('cached', 0)}") lines.append(f"Skipped (already done): {stats.get('skipped', 0)}") lines.append(f"Low confidence: {stats.get('low_confidence', 0)}") lines.append(f"Errors: {stats.get('errors', 0)}") if details: lines.append("") lines.append("Details:") for d in details: tag = " [cached]" if d.get("cached") else "" lines.append(f" {d['category']:>12} | {d['subject'][:50]}{tag}") return "\n".join(lines) # ── main ───────────────────────────────────────────────────────────────────── def _apply_result( client, db, uid, message_id, subject, sender_email, category, is_confident, categories, stats, details, dry_run, use_confidence, priority_cats, ): """Apply classification: label, archive, star, record.""" if use_confidence and not is_confident: log.info(" -> %s (low confidence, skipping)", category) stats["low_confidence"] += 1 return label = categories[category]["label"] should_archive = categories[category].get("archive", False) should_star = category in priority_cats log.info(" -> %s (%s)", category, label) if not dry_run: client.apply_label(uid, label) if should_archive: client.archive(uid) log.info(" archived") if should_star: client.star(uid) log.info(" starred") mark_processed(db, message_id, category) else: extras = [] if should_archive: extras.append("archive") if should_star: extras.append("star") log.info(" [DRY RUN] %s%s", label, (" + " + ", ".join(extras)) if extras else "") stats[category] = stats.get(category, 0) + 1 details.append({"subject": subject, "category": category, "sender": sender_email, "cached": False}) def run(config_path: Path, dry_run: bool = False, reprocess: bool = False, limit: int | None = None, digest: bool = False): cfg = load_config(config_path) gmail_cfg = cfg["gmail"] ollama_cfg = cfg["ollama"] categories = cfg["categories"] proc_cfg = cfg.get("processing", {}) notify_cfg = cfg.get("notifications", {}) batch_size = limit or proc_cfg.get("batch_size", 50) max_body = proc_cfg.get("max_body_chars", 2000) dry_run = dry_run or proc_cfg.get("dry_run", False) mailbox = proc_cfg.get("mailbox", "INBOX") use_confidence = proc_cfg.get("confidence_threshold", True) sender_min_hits = proc_cfg.get("sender_cache_min_hits", 3) use_batch = proc_cfg.get("batch_classify", False) batch_sz = proc_cfg.get("batch_classify_size", 5) priority_cats = proc_cfg.get("priority_star", []) log.info("Connecting to Gmail as %s", gmail_cfg["email"]) client = GmailClient(gmail_cfg["email"], gmail_cfg["app_password"]) db = init_db(DB_PATH) try: uids = client.fetch_uids(mailbox=mailbox, batch_size=batch_size) log.info("Fetched %d message UIDs", len(uids)) stats = {cat: 0 for cat in categories} stats.update({"skipped": 0, "errors": 0, "cached": 0, "low_confidence": 0}) details = [] consecutive_errors = 0 pending = [] for i, uid in enumerate(uids, 1): try: msg = client.fetch_message(uid) message_id = msg.get("Message-ID", f"uid-{uid.decode()}") subject = decode_header(msg.get("Subject")) sender = decode_header(msg.get("From")) sender_email = extract_email_address(sender) if not reprocess and is_processed(db, message_id): stats["skipped"] += 1 continue body = extract_text(msg, max_body) # Sender cache — skip LLM if sender always gets same category cached = get_cached_sender(db, sender_email, sender_min_hits) if cached and not reprocess: log.info("[%d/%d] Cached: %s -> %s (from: %s)", i, len(uids), subject[:50], cached, sender_email) _apply_result( client, db, uid, message_id, subject, sender_email, cached, True, categories, stats, details, dry_run, False, priority_cats, ) details[-1]["cached"] = True stats["cached"] += 1 update_sender_cache(db, sender_email, cached) consecutive_errors = 0 continue # Batch mode: accumulate then classify together if use_batch: pending.append({ "uid": uid, "message_id": message_id, "subject": subject, "sender": sender, "sender_email": sender_email, "body": body, }) if len(pending) < batch_sz and i < len(uids): continue log.info("Classifying batch of %d...", len(pending)) results = classify_batch( ollama_cfg["url"], ollama_cfg["model"], categories, pending, ) for item, (cat, conf) in zip(pending, results): _apply_result( client, db, item["uid"], item["message_id"], item["subject"], item["sender_email"], cat, conf, categories, stats, details, dry_run, use_confidence, priority_cats, ) update_sender_cache(db, item["sender_email"], cat) pending = [] consecutive_errors = 0 continue # Single classification log.info("[%d/%d] Classifying: %s (from: %s)", i, len(uids), subject[:60], sender[:40]) category, is_confident = classify_email( ollama_cfg["url"], ollama_cfg["model"], categories, subject, sender, body, ) _apply_result( client, db, uid, message_id, subject, sender_email, category, is_confident, categories, stats, details, dry_run, use_confidence, priority_cats, ) update_sender_cache(db, sender_email, category) consecutive_errors = 0 except Exception as e: log.error("Error processing UID %s: %s", uid, e) stats["errors"] += 1 consecutive_errors += 1 if consecutive_errors >= 5: log.error("5 consecutive errors, aborting") send_notification( notify_cfg.get("url"), f"Gmail Organizer: 5 consecutive errors. Last: {e}", title="Gmail Organizer ALERT", ) break # Flush remaining batch if pending: try: results = classify_batch( ollama_cfg["url"], ollama_cfg["model"], categories, pending, ) for item, (cat, conf) in zip(pending, results): _apply_result( client, db, item["uid"], item["message_id"], item["subject"], item["sender_email"], cat, conf, categories, stats, details, dry_run, use_confidence, priority_cats, ) update_sender_cache(db, item["sender_email"], cat) except Exception as e: log.error("Batch failed: %s", e) stats["errors"] += len(pending) log.info("Done! Stats: %s", json.dumps(stats, indent=2)) if digest or notify_cfg.get("digest"): digest_text = generate_digest(stats, details) log.info("\n%s", digest_text) if notify_cfg.get("url"): send_notification(notify_cfg["url"], digest_text) if stats["errors"] > 0 and notify_cfg.get("url"): send_notification( notify_cfg["url"], f"Completed with {stats['errors']} errors", title="Gmail Organizer Warning", ) finally: client.close() db.close() def main(): parser = argparse.ArgumentParser(description="Gmail Organizer — LLM-powered email classification") parser.add_argument("-c", "--config", type=Path, default=DEFAULT_CONFIG, help="Path to config YAML (default: config.local.yaml)") parser.add_argument("-n", "--dry-run", action="store_true", help="Classify but don't apply labels") parser.add_argument("--reprocess", action="store_true", help="Re-classify already-processed emails") parser.add_argument("--limit", type=int, default=None, help="Override batch size") parser.add_argument("--digest", action="store_true", help="Print classification digest summary") parser.add_argument("-v", "--verbose", action="store_true", help="Debug logging") args = parser.parse_args() logging.basicConfig( level=logging.DEBUG if args.verbose else logging.INFO, format=LOG_FMT, ) if not args.config.exists(): log.error("Config not found: %s", args.config) log.error("Copy config.yaml to config.local.yaml and fill in your credentials.") sys.exit(1) run(args.config, dry_run=args.dry_run, reprocess=args.reprocess, limit=args.limit, digest=args.digest) if __name__ == "__main__": main()