pounce/backend/app/services/llm_gateway.py

from __future__ import annotations

import json
from typing import Any, AsyncIterator, Optional

import httpx

from app.config import get_settings


settings = get_settings()


class LLMGatewayError(RuntimeError):
    pass


def _auth_headers() -> dict[str, str]:
    key = (settings.llm_gateway_api_key or "").strip()
    if not key:
        raise LLMGatewayError("LLM gateway not configured (missing llm_gateway_api_key)")
    return {"Authorization": f"Bearer {key}"}


async def chat_completions(payload: dict[str, Any]) -> dict[str, Any]:
    """
    Non-streaming call to the LLM gateway (OpenAI-ish format).
    """
    url = settings.llm_gateway_url.rstrip("/") + "/v1/chat/completions"
    async with httpx.AsyncClient(timeout=60) as client:
        r = await client.post(url, headers=_auth_headers(), json=payload)
        if r.status_code >= 400:
            raise LLMGatewayError(f"LLM gateway error: {r.status_code} {r.text[:500]}")
        return r.json()


async def chat_completions_stream(payload: dict[str, Any]) -> AsyncIterator[bytes]:
    """
    Streaming call to the LLM gateway. The gateway returns SSE; we proxy bytes through.
    """
    url = settings.llm_gateway_url.rstrip("/") + "/v1/chat/completions"
    timeout = httpx.Timeout(connect=10, read=None, write=10, pool=10)
    async with httpx.AsyncClient(timeout=timeout) as client:
        async with client.stream("POST", url, headers=_auth_headers(), json=payload) as r:
            if r.status_code >= 400:
                body = await r.aread()
                raise LLMGatewayError(f"LLM gateway stream error: {r.status_code} {body[:500].decode('utf-8','ignore')}")

            async for chunk in r.aiter_bytes():
                if chunk:
                    yield chunk