""" Ops alerting (4B) without external monitoring stack. Runs in the scheduler process: - checks backup freshness (if backups enabled) - checks basic 24h business signals from telemetry (deal inquiries / yield clicks) - sends an aggregated email alert with cooldown to avoid spam """ from __future__ import annotations import logging import os from dataclasses import dataclass from datetime import datetime, timedelta from pathlib import Path from sqlalchemy import and_, func, select from app.config import get_settings from app.database import AsyncSessionLocal from app.models.ops_alert import OpsAlertEvent from app.models.telemetry import TelemetryEvent from app.services.email_service import CONTACT_EMAIL, email_service logger = logging.getLogger(__name__) settings = get_settings() @dataclass(frozen=True) class OpsFinding: key: str severity: str # "warn" | "page" title: str detail: str def _parse_recipients(raw: str) -> list[str]: emails = [e.strip() for e in (raw or "").split(",") if e.strip()] if emails: return emails fallback = (CONTACT_EMAIL or os.getenv("CONTACT_EMAIL", "")).strip() return [fallback] if fallback else [] def _backup_root() -> Path: root = Path(settings.backup_dir) if not root.is_absolute(): root = (Path.cwd() / root).resolve() return root def _latest_backup_age_seconds() -> float | None: root = _backup_root() if not root.exists() or not root.is_dir(): return None files = [p for p in root.glob("*") if p.is_file()] if not files: return None latest = max(files, key=lambda p: p.stat().st_mtime) now = datetime.utcnow().timestamp() return max(0.0, now - float(latest.stat().st_mtime)) async def evaluate_ops_findings() -> list[OpsFinding]: findings: list[OpsFinding] = [] # Backup stale check if settings.enable_db_backups: age = _latest_backup_age_seconds() if age is None: findings.append( OpsFinding( key="backup_missing", severity="page", title="DB backups enabled but no backup file found", detail=f"backup_dir={_backup_root()}", ) ) elif age > float(settings.ops_alert_backup_stale_seconds): findings.append( OpsFinding( key="backup_stale", severity="page", title="DB backup is stale", detail=f"latest_backup_age_seconds={int(age)} threshold={int(settings.ops_alert_backup_stale_seconds)}", ) ) # 24h telemetry signal checks (business sanity) end = datetime.utcnow() start = end - timedelta(days=1) async with AsyncSessionLocal() as db: inquiries_24h = ( await db.execute( select(func.count(TelemetryEvent.id)).where( and_( TelemetryEvent.created_at >= start, TelemetryEvent.created_at <= end, TelemetryEvent.event_name == "inquiry_created", ) ) ) ).scalar() or 0 yield_clicks_24h = ( await db.execute( select(func.count(TelemetryEvent.id)).where( and_( TelemetryEvent.created_at >= start, TelemetryEvent.created_at <= end, TelemetryEvent.event_name == "yield_click", ) ) ) ).scalar() or 0 if int(inquiries_24h) == 0: findings.append( OpsFinding( key="deal_inquiries_zero_24h", severity="warn", title="No inquiries created in last 24h", detail="Deal funnel might be broken or traffic is zero.", ) ) if int(yield_clicks_24h) == 0: findings.append( OpsFinding( key="yield_clicks_zero_24h", severity="warn", title="No yield clicks in last 24h", detail="Yield routing might be misconfigured or traffic is zero.", ) ) return findings async def _cooldown_ok(db, key: str) -> bool: cooldown = max(5, int(settings.ops_alert_cooldown_minutes)) cutoff = datetime.utcnow() - timedelta(minutes=cooldown) last_sent = ( await db.execute( select(OpsAlertEvent.created_at) .where( OpsAlertEvent.alert_key == key, OpsAlertEvent.status == "sent", OpsAlertEvent.created_at >= cutoff, ) .order_by(OpsAlertEvent.created_at.desc()) .limit(1) ) ).scalar_one_or_none() return last_sent is None async def send_ops_alerts(findings: list[OpsFinding]) -> dict: recipients = _parse_recipients(settings.ops_alert_recipients) if not recipients: logger.warning("Ops alerts enabled but no recipients configured (OPS_ALERT_RECIPIENTS/CONTACT_EMAIL).") return {"sent": 0, "skipped": len(findings), "reason": "no_recipients"} if not email_service.is_configured(): return {"sent": 0, "skipped": len(findings), "reason": "smtp_not_configured"} async with AsyncSessionLocal() as db: actionable: list[OpsFinding] = [] skipped = 0 for f in findings: if await _cooldown_ok(db, f.key): actionable.append(f) else: skipped += 1 db.add( OpsAlertEvent( alert_key=f.key, severity=f.severity, title=f.title, detail=f.detail, status="skipped", recipients=",".join(recipients) if recipients else None, send_reason="cooldown", ) ) if not actionable: await db.commit() return {"sent": 0, "skipped": len(findings), "reason": "cooldown"} sev = "PAGE" if any(f.severity == "page" for f in actionable) else "WARN" subject = f"[pounce][{sev}] Ops alerts ({len(actionable)})" items_html = "".join( f"""
{f.title}
{f.key}: {f.detail}
""".strip() for f in actionable ) html = f"""

Ops alerts

Detected {len(actionable)} issue(s). (Cooldown: {int(settings.ops_alert_cooldown_minutes)} min)

{items_html}

Timestamp: {datetime.utcnow().isoformat()}Z

""".strip() text = "\n".join([f"- [{f.severity.upper()}] {f.title} ({f.key}) :: {f.detail}" for f in actionable]) sent = 0 for to in recipients: ok = await email_service.send_email(to_email=to, subject=subject, html_content=html, text_content=text) sent += 1 if ok else 0 # Persist sent events for cooldown + history async with AsyncSessionLocal() as db: for f in actionable: db.add( OpsAlertEvent( alert_key=f.key, severity=f.severity, title=f.title, detail=f.detail, status="sent" if sent else "error", recipients=",".join(recipients) if recipients else None, send_reason=None if sent else "send_failed", ) ) await db.commit() return {"sent": sent, "actionable": len(actionable), "recipients": recipients} async def run_ops_alert_checks() -> dict: """ Entry point for scheduler/admin. Returns findings + send status (if enabled). """ findings = await evaluate_ops_findings() if not settings.ops_alerts_enabled: return {"enabled": False, "findings": [f.__dict__ for f in findings]} send_status = await send_ops_alerts(findings) return {"enabled": True, "findings": [f.__dict__ for f in findings], "send": send_status}