Some checks failed
CI / Frontend Lint & Type Check (push) Has been cancelled
CI / Frontend Build (push) Has been cancelled
CI / Backend Lint (push) Has been cancelled
CI / Backend Tests (push) Has been cancelled
CI / Docker Build (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
Deploy / Build & Push Images (push) Has been cancelled
Deploy / Deploy to Server (push) Has been cancelled
Deploy / Notify (push) Has been cancelled
257 lines
8.6 KiB
Python
257 lines
8.6 KiB
Python
"""
|
|
Ops alerting (4B) without external monitoring stack.
|
|
|
|
Runs in the scheduler process:
|
|
- checks backup freshness (if backups enabled)
|
|
- checks basic 24h business signals from telemetry (deal inquiries / yield clicks)
|
|
- sends an aggregated email alert with cooldown to avoid spam
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
|
|
from sqlalchemy import and_, func, select
|
|
|
|
from app.config import get_settings
|
|
from app.database import AsyncSessionLocal
|
|
from app.models.ops_alert import OpsAlertEvent
|
|
from app.models.telemetry import TelemetryEvent
|
|
from app.services.email_service import CONTACT_EMAIL, email_service
|
|
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
# Application settings, fetched once when this module is imported and read by
# the functions in this module.
settings = get_settings()
|
|
|
|
|
|
@dataclass(frozen=True)
class OpsFinding:
    """One problem detected by an ops check.

    Instances are immutable. The class intentionally keeps a regular
    ``__dict__`` because ``run_ops_alert_checks`` serialises findings
    through it.
    """

    # Identifier of the check that fired; also stored as the alert key that
    # the cooldown lookup filters on.
    key: str
    # Either "warn" or "page".
    severity: str  # "warn" | "page"
    # Short headline shown in the alert e-mail.
    title: str
    # Supporting detail shown next to the key in the alert e-mail.
    detail: str
|
|
|
|
|
|
def _parse_recipients(raw: str) -> list[str]:
|
|
emails = [e.strip() for e in (raw or "").split(",") if e.strip()]
|
|
if emails:
|
|
return emails
|
|
fallback = (CONTACT_EMAIL or os.getenv("CONTACT_EMAIL", "")).strip()
|
|
return [fallback] if fallback else []
|
|
|
|
|
|
def _backup_root() -> Path:
    """Return the configured backup directory as a ``Path``.

    An absolute ``settings.backup_dir`` is returned unchanged; a relative one
    is anchored at the current working directory and resolved.
    """
    configured = Path(settings.backup_dir)
    if configured.is_absolute():
        return configured
    return (Path.cwd() / configured).resolve()
|
|
|
|
|
|
def _latest_backup_age_seconds() -> float | None:
    """Return the age, in seconds, of the most recently modified backup file.

    Only regular files directly inside the backup root are considered
    (no recursion into sub-directories).

    Returns:
        Seconds elapsed since the newest file's modification time (never
        negative), or ``None`` when the backup root is missing, is not a
        directory, or holds no readable regular file.
    """
    import time  # local import keeps this block self-contained

    root = _backup_root()
    if not root.is_dir():  # also False when the path does not exist
        return None

    newest_mtime: float | None = None
    for entry in root.glob("*"):
        try:
            if not entry.is_file():
                continue
            mtime = entry.stat().st_mtime
        except OSError:
            # The file vanished or became unreadable between listing and
            # stat (e.g. backup rotation running concurrently); skip it.
            continue
        if newest_mtime is None or mtime > newest_mtime:
            newest_mtime = mtime

    if newest_mtime is None:
        return None

    # st_mtime is a POSIX timestamp, so it must be compared with time.time().
    # datetime.utcnow().timestamp() interprets the naive UTC value as local
    # time and skews the age by the host's UTC offset.
    return max(0.0, time.time() - float(newest_mtime))
|
|
|
|
|
|
async def evaluate_ops_findings() -> list[OpsFinding]:
    """Run every ops check and collect the problems found.

    Checks performed:
      * backup freshness, only when ``settings.enable_db_backups`` is set;
      * whether any ``inquiry_created`` / ``yield_click`` telemetry events
        were recorded in the trailing 24 hours.

    Returns:
        Findings in check order (backup, inquiries, yield clicks); an empty
        list when nothing is wrong.
    """
    collected: list[OpsFinding] = []

    # --- Backup freshness -------------------------------------------------
    if settings.enable_db_backups:
        backup_age = _latest_backup_age_seconds()
        if backup_age is None:
            collected.append(
                OpsFinding(
                    key="backup_missing",
                    severity="page",
                    title="DB backups enabled but no backup file found",
                    detail=f"backup_dir={_backup_root()}",
                )
            )
        elif backup_age > float(settings.ops_alert_backup_stale_seconds):
            collected.append(
                OpsFinding(
                    key="backup_stale",
                    severity="page",
                    title="DB backup is stale",
                    detail=f"latest_backup_age_seconds={int(backup_age)} threshold={int(settings.ops_alert_backup_stale_seconds)}",
                )
            )

    # --- 24h telemetry signals (business sanity) --------------------------
    window_end = datetime.utcnow()
    window_start = window_end - timedelta(days=1)

    def count_stmt(event_name: str):
        # COUNT of telemetry rows with this event name inside the window.
        return select(func.count(TelemetryEvent.id)).where(
            and_(
                TelemetryEvent.created_at >= window_start,
                TelemetryEvent.created_at <= window_end,
                TelemetryEvent.event_name == event_name,
            )
        )

    async with AsyncSessionLocal() as db:
        inquiry_result = await db.execute(count_stmt("inquiry_created"))
        inquiry_total = inquiry_result.scalar() or 0

        click_result = await db.execute(count_stmt("yield_click"))
        click_total = click_result.scalar() or 0

    # (observed count, finding key, title, detail) for each zero-signal check.
    zero_signal_checks = (
        (
            inquiry_total,
            "deal_inquiries_zero_24h",
            "No inquiries created in last 24h",
            "Deal funnel might be broken or traffic is zero.",
        ),
        (
            click_total,
            "yield_clicks_zero_24h",
            "No yield clicks in last 24h",
            "Yield routing might be misconfigured or traffic is zero.",
        ),
    )
    for total, finding_key, headline, explanation in zero_signal_checks:
        if int(total) == 0:
            collected.append(
                OpsFinding(
                    key=finding_key,
                    severity="warn",
                    title=headline,
                    detail=explanation,
                )
            )

    return collected
|
|
|
|
|
|
async def _cooldown_ok(db, key: str) -> bool:
    """Tell whether an alert for ``key`` may be sent now.

    The cooldown window is ``settings.ops_alert_cooldown_minutes`` with a
    floor of 5 minutes, measured back from the current naive-UTC time.

    Args:
        db: Session used to query ``OpsAlertEvent`` rows.
        key: Alert key to look up.

    Returns:
        ``True`` when no event with status ``"sent"`` exists for ``key``
        inside the window, ``False`` otherwise.
    """
    window_minutes = max(5, int(settings.ops_alert_cooldown_minutes))
    window_start = datetime.utcnow() - timedelta(minutes=window_minutes)

    recent_sent_stmt = (
        select(OpsAlertEvent.created_at)
        .where(
            OpsAlertEvent.alert_key == key,
            OpsAlertEvent.status == "sent",
            OpsAlertEvent.created_at >= window_start,
        )
        .order_by(OpsAlertEvent.created_at.desc())
        .limit(1)
    )
    result = await db.execute(recent_sent_stmt)
    return result.scalar_one_or_none() is None
|
|
|
|
|
|
def _ops_alert_event(
    finding: OpsFinding,
    recipients: list[str],
    *,
    status: str,
    send_reason: str | None,
) -> OpsAlertEvent:
    """Build the ``OpsAlertEvent`` history row describing one finding."""
    return OpsAlertEvent(
        alert_key=finding.key,
        severity=finding.severity,
        title=finding.title,
        detail=finding.detail,
        status=status,
        recipients=",".join(recipients) if recipients else None,
        send_reason=send_reason,
    )


async def send_ops_alerts(findings: list[OpsFinding]) -> dict:
    """Send one aggregated alert e-mail for all findings not in cooldown.

    Every finding is recorded as an ``OpsAlertEvent``: ``"skipped"`` when its
    key is still in cooldown, otherwise ``"sent"`` (at least one recipient
    accepted the mail) or ``"error"`` (no send succeeded).

    Args:
        findings: Findings produced by the ops checks.

    Returns:
        A status dict. With no recipients, unconfigured SMTP, or everything
        in cooldown: ``{"sent": 0, "skipped": <n>, "reason": <str>}``.
        Otherwise ``{"sent", "actionable", "skipped", "recipients"}`` where
        ``sent`` is the number of successful per-recipient sends.
    """
    from html import escape  # local import keeps this block self-contained

    recipients = _parse_recipients(settings.ops_alert_recipients)
    if not recipients:
        logger.warning("Ops alerts enabled but no recipients configured (OPS_ALERT_RECIPIENTS/CONTACT_EMAIL).")
        return {"sent": 0, "skipped": len(findings), "reason": "no_recipients"}
    if not email_service.is_configured():
        return {"sent": 0, "skipped": len(findings), "reason": "smtp_not_configured"}

    actionable: list[OpsFinding] = []
    skipped = 0
    async with AsyncSessionLocal() as db:
        for f in findings:
            if await _cooldown_ok(db, f.key):
                actionable.append(f)
            else:
                skipped += 1
                db.add(_ops_alert_event(f, recipients, status="skipped", send_reason="cooldown"))
        # Commit unconditionally: previously the "skipped" rows were only
        # committed when nothing was actionable, so they were lost whenever
        # a run mixed actionable and cooled-down findings.
        await db.commit()

    if not actionable:
        return {"sent": 0, "skipped": len(findings), "reason": "cooldown"}

    sev = "PAGE" if any(f.severity == "page" for f in actionable) else "WARN"
    subject = f"[pounce][{sev}] Ops alerts ({len(actionable)})"

    # Show the cooldown that is actually enforced (same 5-minute floor as
    # _cooldown_ok), not the raw setting.
    effective_cooldown = max(5, int(settings.ops_alert_cooldown_minutes))

    item_blocks: list[str] = []
    for f in actionable:
        accent = "#ef4444" if f.severity == "page" else "#f59e0b"
        # Escape finding text so characters such as "<" or "&" (e.g. in a
        # backup path) cannot break the markup.
        item_blocks.append(
            f"""
<div style="padding: 12px 14px; background: #fafafa; border-radius: 8px; border-left: 3px solid {accent}; margin: 10px 0;">
<div style="font-weight:600; color:#000;">{escape(f.title)}</div>
<div style="margin-top:6px; font-size:13px; color:#444; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, 'Liberation Mono', 'Courier New', monospace;">
{escape(f.key)}: {escape(f.detail)}
</div>
</div>
""".strip()
        )
    items_html = "".join(item_blocks)

    html_body = f"""
<h2 style="margin: 0 0 16px 0; font-size: 18px; font-weight: 700; color: #000000;">
Ops alerts
</h2>
<p style="margin: 0 0 16px 0; color:#333; line-height:1.6;">
Detected {len(actionable)} issue(s). (Cooldown: {effective_cooldown} min)
</p>
{items_html}
<p style="margin: 18px 0 0 0; font-size: 12px; color:#777;">
Timestamp: {datetime.utcnow().isoformat()}Z
</p>
""".strip()

    text = "\n".join(f"- [{f.severity.upper()}] {f.title} ({f.key}) :: {f.detail}" for f in actionable)

    sent = 0
    for to in recipients:
        ok = await email_service.send_email(to_email=to, subject=subject, html_content=html_body, text_content=text)
        if ok:
            sent += 1

    # Persist the outcome for cooldown + history. A separate session is used
    # so no database session stays open while the mails are being sent.
    outcome_status = "sent" if sent else "error"
    outcome_reason = None if sent else "send_failed"
    async with AsyncSessionLocal() as db:
        for f in actionable:
            db.add(_ops_alert_event(f, recipients, status=outcome_status, send_reason=outcome_reason))
        await db.commit()

    return {
        "sent": sent,
        "actionable": len(actionable),
        "skipped": skipped,
        "recipients": recipients,
    }
|
|
|
|
|
|
async def run_ops_alert_checks() -> dict:
    """Entry point for scheduler/admin.

    Evaluates all ops findings and, when ``settings.ops_alerts_enabled`` is
    set, sends alerts for them.

    Returns:
        ``{"enabled": bool, "findings": [...]}`` where each finding is its
        attribute dict, plus a ``"send"`` entry holding the send status when
        alerts are enabled.
    """
    detected = await evaluate_ops_findings()

    if settings.ops_alerts_enabled:
        outcome = await send_ops_alerts(detected)
        return {
            "enabled": True,
            "findings": [vars(item) for item in detected],
            "send": outcome,
        }

    return {"enabled": False, "findings": [vars(item) for item in detected]}
|
|
|