pounce/backend/app/services/ops_alerts.py
Yves Gugger bb7ce97330
Some checks failed
CI / Frontend Lint & Type Check (push) Has been cancelled
CI / Frontend Build (push) Has been cancelled
CI / Backend Lint (push) Has been cancelled
CI / Backend Tests (push) Has been cancelled
CI / Docker Build (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
Deploy / Build & Push Images (push) Has been cancelled
Deploy / Deploy to Server (push) Has been cancelled
Deploy / Notify (push) Has been cancelled
Deploy: referral rewards antifraud + legal contact updates
2025-12-15 13:56:43 +01:00

257 lines
8.6 KiB
Python

"""
Ops alerting (4B) without external monitoring stack.
Runs in the scheduler process:
- checks backup freshness (if backups enabled)
- checks basic 24h business signals from telemetry (deal inquiries / yield clicks)
- sends an aggregated email alert with cooldown to avoid spam
"""
from __future__ import annotations

import logging
import os
import time
from dataclasses import dataclass
from datetime import datetime, timedelta
from html import escape as html_escape
from pathlib import Path

from sqlalchemy import and_, func, select

from app.config import get_settings
from app.database import AsyncSessionLocal
from app.models.ops_alert import OpsAlertEvent
from app.models.telemetry import TelemetryEvent
from app.services.email_service import CONTACT_EMAIL, email_service
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Application settings, fetched once when this module is imported and read by
# every function in this module.
settings = get_settings()
@dataclass(frozen=True)
class OpsFinding:
    """
    A single problem detected by an ops check.

    Instances are immutable. They are serialized through their instance
    ``__dict__`` elsewhere in this module, so the class intentionally does
    not use ``__slots__``.
    """

    # Stable identifier of the finding; also used as the cooldown/history key.
    key: str
    # Urgency level: "warn" | "page".
    severity: str
    # Short human-readable headline.
    title: str
    # Supporting detail shown next to the key in the alert.
    detail: str
def _parse_recipients(raw: str) -> list[str]:
emails = [e.strip() for e in (raw or "").split(",") if e.strip()]
if emails:
return emails
fallback = (CONTACT_EMAIL or os.getenv("CONTACT_EMAIL", "")).strip()
return [fallback] if fallback else []
def _backup_root() -> Path:
    """
    Return the configured backup directory as a path.

    An absolute ``settings.backup_dir`` is returned unchanged; a relative one
    is anchored at the current working directory and resolved.
    """
    configured = Path(settings.backup_dir)
    if configured.is_absolute():
        return configured
    return (Path.cwd() / configured).resolve()
def _latest_backup_age_seconds() -> float | None:
    """
    Return the age, in seconds, of the most recently modified backup file.

    Only regular files directly inside the backup directory are considered.
    Returns ``None`` when the directory does not exist, is not a directory,
    or contains no files. The result is never negative.
    """
    root = _backup_root()
    # is_dir() is already False for a missing path, so one check covers both.
    if not root.is_dir():
        return None

    mtimes: list[float] = []
    for entry in root.glob("*"):
        try:
            if entry.is_file():
                mtimes.append(float(entry.stat().st_mtime))
        except OSError:
            # The file vanished between listing and stat (e.g. backup
            # rotation running concurrently); just ignore it.
            continue
    if not mtimes:
        return None

    # st_mtime is a POSIX epoch timestamp, so compare it against time.time().
    # The previous datetime.utcnow().timestamp() treated the naive UTC value
    # as *local* time, shifting "now" by the host's UTC offset and thereby
    # under- or over-reporting the backup age on any non-UTC server.
    return max(0.0, time.time() - max(mtimes))
async def evaluate_ops_findings() -> list[OpsFinding]:
    """
    Run all ops checks and collect the problems they detect.

    Checks performed:
    - backup freshness (only when ``settings.enable_db_backups`` is set):
      a missing backup or one older than
      ``settings.ops_alert_backup_stale_seconds`` yields a "page" finding;
    - telemetry sanity: zero ``inquiry_created`` or zero ``yield_click``
      events in the last 24 hours each yield a "warn" finding.

    Returns the findings in the order: backup, inquiries, yield clicks.
    """
    results: list[OpsFinding] = []

    # --- Backup freshness ---------------------------------------------------
    if settings.enable_db_backups:
        backup_age = _latest_backup_age_seconds()
        if backup_age is None:
            results.append(
                OpsFinding(
                    key="backup_missing",
                    severity="page",
                    title="DB backups enabled but no backup file found",
                    detail=f"backup_dir={_backup_root()}",
                )
            )
        elif backup_age > float(settings.ops_alert_backup_stale_seconds):
            results.append(
                OpsFinding(
                    key="backup_stale",
                    severity="page",
                    title="DB backup is stale",
                    detail=f"latest_backup_age_seconds={int(backup_age)} threshold={int(settings.ops_alert_backup_stale_seconds)}",
                )
            )

    # --- 24h telemetry signals (business sanity) ------------------------------
    window_end = datetime.utcnow()
    window_start = window_end - timedelta(days=1)

    event_counts: dict[str, int] = {}
    async with AsyncSessionLocal() as db:
        # One count query per event name, inquiries first, then yield clicks.
        for event_name in ("inquiry_created", "yield_click"):
            count_stmt = select(func.count(TelemetryEvent.id)).where(
                and_(
                    TelemetryEvent.created_at >= window_start,
                    TelemetryEvent.created_at <= window_end,
                    TelemetryEvent.event_name == event_name,
                )
            )
            outcome = await db.execute(count_stmt)
            event_counts[event_name] = outcome.scalar() or 0

    if int(event_counts["inquiry_created"]) == 0:
        results.append(
            OpsFinding(
                key="deal_inquiries_zero_24h",
                severity="warn",
                title="No inquiries created in last 24h",
                detail="Deal funnel might be broken or traffic is zero.",
            )
        )
    if int(event_counts["yield_click"]) == 0:
        results.append(
            OpsFinding(
                key="yield_clicks_zero_24h",
                severity="warn",
                title="No yield clicks in last 24h",
                detail="Yield routing might be misconfigured or traffic is zero.",
            )
        )
    return results
async def _cooldown_ok(db, key: str) -> bool:
    """
    Tell whether an alert for ``key`` may be sent now.

    Returns True when no event with status "sent" exists for ``key`` within
    the cooldown window. The window is ``settings.ops_alert_cooldown_minutes``
    with a floor of 5 minutes.
    """
    window_minutes = max(5, int(settings.ops_alert_cooldown_minutes))
    window_start = datetime.utcnow() - timedelta(minutes=window_minutes)

    # Only the newest matching row matters; its mere existence blocks sending.
    recent_sent_stmt = (
        select(OpsAlertEvent.created_at)
        .where(
            OpsAlertEvent.alert_key == key,
            OpsAlertEvent.status == "sent",
            OpsAlertEvent.created_at >= window_start,
        )
        .order_by(OpsAlertEvent.created_at.desc())
        .limit(1)
    )
    outcome = await db.execute(recent_sent_stmt)
    return outcome.scalar_one_or_none() is None
async def send_ops_alerts(findings: list[OpsFinding]) -> dict:
    """
    Email the given findings to the ops recipients, honoring the cooldown.

    Findings whose key was already alerted within the cooldown window are
    recorded as "skipped" events. The remaining findings are sent as one
    aggregated email per recipient and recorded as "sent", or as "error"
    when delivery failed for every recipient.

    Returns a summary dict. When nothing is sent it has the keys ``sent``,
    ``skipped`` and ``reason`` ("no_recipients", "smtp_not_configured",
    "no_findings" or "cooldown"). Otherwise it has ``sent`` (number of
    recipients delivered to), ``actionable``, ``skipped`` and ``recipients``.
    """
    recipients = _parse_recipients(settings.ops_alert_recipients)
    if not recipients:
        logger.warning("Ops alerts enabled but no recipients configured (OPS_ALERT_RECIPIENTS/CONTACT_EMAIL).")
        return {"sent": 0, "skipped": len(findings), "reason": "no_recipients"}
    if not email_service.is_configured():
        return {"sent": 0, "skipped": len(findings), "reason": "smtp_not_configured"}
    if not findings:
        # Nothing to report: avoid a pointless DB session and do not label
        # the outcome as a cooldown skip.
        return {"sent": 0, "skipped": 0, "reason": "no_findings"}

    recipients_csv = ",".join(recipients)

    actionable: list[OpsFinding] = []
    skipped = 0
    async with AsyncSessionLocal() as db:
        for f in findings:
            if await _cooldown_ok(db, f.key):
                actionable.append(f)
                continue
            skipped += 1
            db.add(
                OpsAlertEvent(
                    alert_key=f.key,
                    severity=f.severity,
                    title=f.title,
                    detail=f.detail,
                    status="skipped",
                    recipients=recipients_csv,
                    send_reason="cooldown",
                )
            )
        if skipped:
            # Commit on the session that holds the skipped rows. Previously
            # this only happened when *nothing* was actionable, so skipped
            # events were silently dropped whenever at least one finding was
            # sent (the later commit runs on a different session).
            await db.commit()

    if not actionable:
        return {"sent": 0, "skipped": len(findings), "reason": "cooldown"}

    sev = "PAGE" if any(f.severity == "page" for f in actionable) else "WARN"
    subject = f"[pounce][{sev}] Ops alerts ({len(actionable)})"

    # Finding text is escaped before it is embedded in the HTML body so that
    # characters such as "<" or "&" (e.g. in a path) cannot break the markup.
    items_html = "".join(
        f"""
<div style="padding: 12px 14px; background: #fafafa; border-radius: 8px; border-left: 3px solid {'#ef4444' if f.severity=='page' else '#f59e0b'}; margin: 10px 0;">
<div style="font-weight:600; color:#000;">{html_escape(str(f.title))}</div>
<div style="margin-top:6px; font-size:13px; color:#444; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, 'Liberation Mono', 'Courier New', monospace;">
{html_escape(str(f.key))}: {html_escape(str(f.detail))}
</div>
</div>
""".strip()
        for f in actionable
    )
    html_body = f"""
<h2 style="margin: 0 0 16px 0; font-size: 18px; font-weight: 700; color: #000000;">
Ops alerts
</h2>
<p style="margin: 0 0 16px 0; color:#333; line-height:1.6;">
Detected {len(actionable)} issue(s). (Cooldown: {int(settings.ops_alert_cooldown_minutes)} min)
</p>
{items_html}
<p style="margin: 18px 0 0 0; font-size: 12px; color:#777;">
Timestamp: {datetime.utcnow().isoformat()}Z
</p>
""".strip()
    # The plain-text alternative carries the raw (unescaped) finding text.
    text = "\n".join([f"- [{f.severity.upper()}] {f.title} ({f.key}) :: {f.detail}" for f in actionable])

    sent = 0
    for to in recipients:
        ok = await email_service.send_email(to_email=to, subject=subject, html_content=html_body, text_content=text)
        if ok:
            sent += 1

    # Persist sent events for cooldown + history. One successful delivery is
    # enough to count the alert as sent and start the cooldown.
    status = "sent" if sent else "error"
    send_reason = None if sent else "send_failed"
    async with AsyncSessionLocal() as db:
        for f in actionable:
            db.add(
                OpsAlertEvent(
                    alert_key=f.key,
                    severity=f.severity,
                    title=f.title,
                    detail=f.detail,
                    status=status,
                    recipients=recipients_csv,
                    send_reason=send_reason,
                )
            )
        await db.commit()

    return {
        "sent": sent,
        "actionable": len(actionable),
        "skipped": skipped,
        "recipients": recipients,
    }
async def run_ops_alert_checks() -> dict:
    """
    Entry point for scheduler/admin.

    Evaluates all ops checks and, when ``settings.ops_alerts_enabled`` is
    set, sends the resulting alerts. The returned dict always contains
    ``enabled`` and ``findings`` (each finding as a plain dict); ``send``
    (the send summary) is present only when alerting is enabled.
    """
    findings = await evaluate_ops_findings()
    # vars() yields the same instance dict as reading __dict__ directly.
    serialized = [vars(item) for item in findings]

    if settings.ops_alerts_enabled:
        outcome = await send_ops_alerts(findings)
        return {"enabled": True, "findings": serialized, "send": outcome}
    return {"enabled": False, "findings": serialized}