# mediaproc/core/detect/inference/client.py

"""
HTTP client for the inference server.

The pipeline stages call this instead of importing ML libraries directly.
The inference server runs on the GPU machine (or spot instance).
"""

from __future__ import annotations

import base64
import io
import logging
import os

import numpy as np
import requests
from PIL import Image

from .types import DetectResult, OCRResult, RegionDebugResult, RegionResult, ServerStatus, VLMResult

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Server root used when a client is created without an explicit base_url;
# read once at import time from the INFERENCE_URL environment variable.
DEFAULT_URL = os.getenv("INFERENCE_URL", "http://localhost:8000")


def _encode_image(image: np.ndarray) -> str:
    """Encode a numpy image array as a base64 JPEG string.

    Args:
        image: Pixel array in a layout accepted by ``PIL.Image.fromarray``.

    Returns:
        The JPEG bytes (quality 85), base64-encoded and decoded to ``str``.
    """
    img = Image.fromarray(image)
    # JPEG has no alpha channel and Pillow refuses to write modes such as
    # RGBA / LA / I / F; convert those to RGB instead of failing in save().
    if img.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        img = img.convert("RGB")
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=85)
    return base64.b64encode(buf.getvalue()).decode()


class InferenceClient:
    """HTTP client for the GPU inference server.

    All requests go through one ``requests.Session`` rooted at ``base_url``
    and share the same ``timeout``. The client can be used as a context
    manager (or closed explicitly with ``close()``) so the session's pooled
    connections are released when the caller is done.
    """

    def __init__(self, base_url: str | None = None, timeout: float = 60.0,
                 job_id: str = "", log_level: str = "INFO"):
        """Create a client.

        Args:
            base_url: Server root URL; falls back to ``DEFAULT_URL`` when
                empty or ``None``. Trailing slashes are stripped.
            timeout: Timeout (seconds) passed to every request.
            job_id: When non-empty, sent as the ``X-Job-Id`` header on every
                request.
            log_level: Sent as the ``X-Log-Level`` header, but only when
                ``job_id`` is set.
        """
        self.base_url = (base_url or DEFAULT_URL).rstrip("/")
        self.timeout = timeout
        self.job_id = job_id
        self.log_level = log_level
        self.session = requests.Session()
        if job_id:
            self.session.headers["X-Job-Id"] = job_id
            self.session.headers["X-Log-Level"] = log_level

    # -- lifecycle ---------------------------------------------------------

    def close(self) -> None:
        """Close the underlying HTTP session and its pooled connections."""
        self.session.close()

    def __enter__(self) -> InferenceClient:
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the session on leaving the ``with`` block."""
        self.close()

    # -- private helpers ---------------------------------------------------

    def _post(self, path: str, payload: dict) -> requests.Response:
        """POST ``payload`` as JSON to ``base_url + path``.

        Raises whatever ``requests`` raises for transport failures, and
        ``requests.HTTPError`` (via ``raise_for_status``) for error statuses.
        """
        resp = self.session.post(
            f"{self.base_url}{path}",
            json=payload,
            timeout=self.timeout,
        )
        resp.raise_for_status()
        return resp

    @staticmethod
    def _parse_regions(items: list[dict]) -> list[RegionResult]:
        """Build ``RegionResult`` objects from the server's region dicts."""
        return [
            RegionResult(
                x=r["x"], y=r["y"], w=r["w"], h=r["h"],
                confidence=r["confidence"], label=r["label"],
            )
            for r in items
        ]

    # -- endpoints ---------------------------------------------------------

    def health(self) -> ServerStatus:
        """Check server health and loaded models.

        Returns:
            A ``ServerStatus`` built from the ``/health`` response; fields
            missing from the response fall back to empty/zero defaults and
            the ``"sequential"`` strategy.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        resp = self.session.get(f"{self.base_url}/health", timeout=self.timeout)
        resp.raise_for_status()
        data = resp.json()
        return ServerStatus(
            loaded_models=data.get("loaded_models", []),
            vram_used_mb=data.get("vram_used_mb", 0),
            vram_budget_mb=data.get("vram_budget_mb", 0),
            strategy=data.get("strategy", "sequential"),
        )

    def detect(
        self,
        image: np.ndarray,
        model: str = "yolov8n",
        confidence: float = 0.3,
        target_classes: list[str] | None = None,
    ) -> list[DetectResult]:
        """Run object detection on an image.

        Args:
            image: Image array; sent as a base64 JPEG.
            model: Detection model name forwarded to the server.
            confidence: Confidence value forwarded to the server.
            target_classes: Optional class names; only included in the
                request when non-empty.

        Returns:
            One ``DetectResult`` per entry in the response's ``detections``
            list (empty list when the key is absent).

        Raises:
            requests.HTTPError: If the server answers with an error status.
            KeyError: If a detection entry lacks an expected field.
        """
        payload: dict[str, object] = {
            "image": _encode_image(image),
            "model": model,
            "confidence": confidence,
        }
        if target_classes:
            payload["target_classes"] = target_classes

        data = self._post("/detect", payload).json()
        return [
            DetectResult(
                x=d["x"], y=d["y"], w=d["w"], h=d["h"],
                confidence=d["confidence"], label=d["label"],
            )
            for d in data.get("detections", [])
        ]

    def ocr(
        self,
        image: np.ndarray,
        languages: list[str] | None = None,
    ) -> list[OCRResult]:
        """Run OCR on an image region.

        Args:
            image: Image array; sent as a base64 JPEG.
            languages: Optional language codes; only included in the request
                when non-empty.

        Returns:
            One ``OCRResult`` per entry in the response's ``results`` list
            (empty list when the key is absent).

        Raises:
            requests.HTTPError: If the server answers with an error status.
            KeyError: If a result entry lacks an expected field.
        """
        payload: dict[str, object] = {"image": _encode_image(image)}
        if languages:
            payload["languages"] = languages

        data = self._post("/ocr", payload).json()
        return [
            OCRResult(
                text=d["text"],
                confidence=d["confidence"],
                bbox=tuple(d["bbox"]),
            )
            for d in data.get("results", [])
        ]

    def vlm(
        self,
        image: np.ndarray,
        prompt: str,
        model: str = "moondream2",
    ) -> VLMResult:
        """Query a visual language model with an image crop + prompt.

        Args:
            image: Image array; sent as a base64 JPEG.
            prompt: Prompt text forwarded to the server.
            model: VLM model name forwarded to the server.

        Returns:
            A ``VLMResult``; fields missing from the response default to
            empty strings and ``0.0`` confidence.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        payload = {
            "image": _encode_image(image),
            "prompt": prompt,
            "model": model,
        }
        data = self._post("/vlm", payload).json()
        return VLMResult(
            brand=data.get("brand", ""),
            confidence=data.get("confidence", 0.0),
            reasoning=data.get("reasoning", ""),
        )

    def detect_edges(
        self,
        image: np.ndarray,
        edge_canny_low: int = 50,
        edge_canny_high: int = 150,
        edge_hough_threshold: int = 80,
        edge_hough_min_length: int = 100,
        edge_hough_max_gap: int = 10,
        edge_pair_max_distance: int = 200,
        edge_pair_min_distance: int = 15,
    ) -> list[RegionResult]:
        """Run edge detection on an image.

        The ``edge_*`` arguments are forwarded to the server unchanged under
        the same names.

        Returns:
            One ``RegionResult`` per entry in the response's ``regions`` list
            (empty list when the key is absent).

        Raises:
            requests.HTTPError: If the server answers with an error status.
            KeyError: If a region entry lacks an expected field.
        """
        payload = {
            "image": _encode_image(image),
            "edge_canny_low": edge_canny_low,
            "edge_canny_high": edge_canny_high,
            "edge_hough_threshold": edge_hough_threshold,
            "edge_hough_min_length": edge_hough_min_length,
            "edge_hough_max_gap": edge_hough_max_gap,
            "edge_pair_max_distance": edge_pair_max_distance,
            "edge_pair_min_distance": edge_pair_min_distance,
        }
        data = self._post("/detect_edges", payload).json()
        return self._parse_regions(data.get("regions", []))

    def detect_edges_debug(
        self,
        image: np.ndarray,
        edge_canny_low: int = 50,
        edge_canny_high: int = 150,
        edge_hough_threshold: int = 80,
        edge_hough_min_length: int = 100,
        edge_hough_max_gap: int = 10,
        edge_pair_max_distance: int = 200,
        edge_pair_min_distance: int = 15,
    ) -> RegionDebugResult:
        """Run edge detection with debug overlays.

        Takes the same arguments as ``detect_edges`` but calls the
        ``/detect_edges/debug`` endpoint.

        Returns:
            A ``RegionDebugResult`` holding the parsed regions plus the
            overlay strings and counts from the response (empty strings and
            zeros when absent).

        Raises:
            requests.HTTPError: If the server answers with an error status.
            KeyError: If a region entry lacks an expected field.
        """
        payload = {
            "image": _encode_image(image),
            "edge_canny_low": edge_canny_low,
            "edge_canny_high": edge_canny_high,
            "edge_hough_threshold": edge_hough_threshold,
            "edge_hough_min_length": edge_hough_min_length,
            "edge_hough_max_gap": edge_hough_max_gap,
            "edge_pair_max_distance": edge_pair_max_distance,
            "edge_pair_min_distance": edge_pair_min_distance,
        }
        data = self._post("/detect_edges/debug", payload).json()
        return RegionDebugResult(
            regions=self._parse_regions(data.get("regions", [])),
            edge_overlay_b64=data.get("edge_overlay_b64", ""),
            lines_overlay_b64=data.get("lines_overlay_b64", ""),
            horizontal_count=data.get("horizontal_count", 0),
            pair_count=data.get("pair_count", 0),
        )

    def post(self, path: str, payload: dict) -> dict | None:
        """Generic POST to the inference server. Returns JSON response or None on error.

        Args:
            path: Endpoint path relative to ``base_url``; a missing leading
                slash is added so ``"models/x"`` and ``"/models/x"`` hit the
                same URL.
            payload: JSON-serialisable request body.

        Returns:
            The decoded JSON response, or ``None`` if anything went wrong
            (the failure is logged as a warning, never raised).
        """
        try:
            if path and not path.startswith("/"):
                path = f"/{path}"
            return self._post(path, payload).json()
        except Exception as e:
            # Deliberately broad: this method is best-effort by contract.
            logger.warning("Inference POST %s failed: %s", path, e)
            return None

    def load_model(self, model: str, quantization: str = "fp16") -> None:
        """Request the server to load a model into VRAM.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        self._post("/models/load", {"model": model, "quantization": quantization})

    def unload_model(self, model: str) -> None:
        """Request the server to unload a model from VRAM.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        self._post("/models/unload", {"model": model})