""" HTTP client for the inference server. The pipeline stages call this instead of importing ML libraries directly. The inference server runs on the GPU machine (or spot instance). """ from __future__ import annotations import base64 import io import logging import os import numpy as np import requests from PIL import Image from .types import DetectResult, OCRResult, RegionDebugResult, RegionResult, ServerStatus, VLMResult logger = logging.getLogger(__name__) DEFAULT_URL = os.environ.get("INFERENCE_URL", "http://localhost:8000") def _encode_image(image: np.ndarray) -> str: """Encode numpy array as base64 JPEG.""" img = Image.fromarray(image) buf = io.BytesIO() img.save(buf, format="JPEG", quality=85) return base64.b64encode(buf.getvalue()).decode() class InferenceClient: """HTTP client for the GPU inference server.""" def __init__(self, base_url: str | None = None, timeout: float = 60.0, job_id: str = "", log_level: str = "INFO"): self.base_url = (base_url or DEFAULT_URL).rstrip("/") self.timeout = timeout self.job_id = job_id self.log_level = log_level self.session = requests.Session() if job_id: self.session.headers["X-Job-Id"] = job_id self.session.headers["X-Log-Level"] = log_level def health(self) -> ServerStatus: """Check server health and loaded models.""" resp = self.session.get(f"{self.base_url}/health", timeout=self.timeout) resp.raise_for_status() data = resp.json() return ServerStatus( loaded_models=data.get("loaded_models", []), vram_used_mb=data.get("vram_used_mb", 0), vram_budget_mb=data.get("vram_budget_mb", 0), strategy=data.get("strategy", "sequential"), ) def detect( self, image: np.ndarray, model: str = "yolov8n", confidence: float = 0.3, target_classes: list[str] | None = None, ) -> list[DetectResult]: """Run object detection on an image.""" payload = { "image": _encode_image(image), "model": model, "confidence": confidence, } if target_classes: payload["target_classes"] = target_classes resp = self.session.post( f"{self.base_url}/detect", json=payload, timeout=self.timeout, ) resp.raise_for_status() results = [] for d in resp.json().get("detections", []): result = DetectResult( x=d["x"], y=d["y"], w=d["w"], h=d["h"], confidence=d["confidence"], label=d["label"], ) results.append(result) return results def ocr( self, image: np.ndarray, languages: list[str] | None = None, ) -> list[OCRResult]: """Run OCR on an image region.""" payload = { "image": _encode_image(image), } if languages: payload["languages"] = languages resp = self.session.post( f"{self.base_url}/ocr", json=payload, timeout=self.timeout, ) resp.raise_for_status() results = [] for d in resp.json().get("results", []): result = OCRResult( text=d["text"], confidence=d["confidence"], bbox=tuple(d["bbox"]), ) results.append(result) return results def vlm( self, image: np.ndarray, prompt: str, model: str = "moondream2", ) -> VLMResult: """Query a visual language model with an image crop + prompt.""" payload = { "image": _encode_image(image), "prompt": prompt, "model": model, } resp = self.session.post( f"{self.base_url}/vlm", json=payload, timeout=self.timeout, ) resp.raise_for_status() data = resp.json() return VLMResult( brand=data.get("brand", ""), confidence=data.get("confidence", 0.0), reasoning=data.get("reasoning", ""), ) def detect_edges( self, image: np.ndarray, edge_canny_low: int = 50, edge_canny_high: int = 150, edge_hough_threshold: int = 80, edge_hough_min_length: int = 100, edge_hough_max_gap: int = 10, edge_pair_max_distance: int = 200, edge_pair_min_distance: int = 15, ) -> list[RegionResult]: """Run edge detection on an image.""" payload = { "image": _encode_image(image), "edge_canny_low": edge_canny_low, "edge_canny_high": edge_canny_high, "edge_hough_threshold": edge_hough_threshold, "edge_hough_min_length": edge_hough_min_length, "edge_hough_max_gap": edge_hough_max_gap, "edge_pair_max_distance": edge_pair_max_distance, "edge_pair_min_distance": edge_pair_min_distance, } resp = self.session.post( f"{self.base_url}/detect_edges", json=payload, timeout=self.timeout, ) resp.raise_for_status() results = [] for r in resp.json().get("regions", []): result = RegionResult( x=r["x"], y=r["y"], w=r["w"], h=r["h"], confidence=r["confidence"], label=r["label"], ) results.append(result) return results def detect_edges_debug( self, image: np.ndarray, edge_canny_low: int = 50, edge_canny_high: int = 150, edge_hough_threshold: int = 80, edge_hough_min_length: int = 100, edge_hough_max_gap: int = 10, edge_pair_max_distance: int = 200, edge_pair_min_distance: int = 15, ) -> RegionDebugResult: """Run edge detection with debug overlays.""" payload = { "image": _encode_image(image), "edge_canny_low": edge_canny_low, "edge_canny_high": edge_canny_high, "edge_hough_threshold": edge_hough_threshold, "edge_hough_min_length": edge_hough_min_length, "edge_hough_max_gap": edge_hough_max_gap, "edge_pair_max_distance": edge_pair_max_distance, "edge_pair_min_distance": edge_pair_min_distance, } resp = self.session.post( f"{self.base_url}/detect_edges/debug", json=payload, timeout=self.timeout, ) resp.raise_for_status() data = resp.json() regions = [] for r in data.get("regions", []): region = RegionResult( x=r["x"], y=r["y"], w=r["w"], h=r["h"], confidence=r["confidence"], label=r["label"], ) regions.append(region) return RegionDebugResult( regions=regions, edge_overlay_b64=data.get("edge_overlay_b64", ""), lines_overlay_b64=data.get("lines_overlay_b64", ""), horizontal_count=data.get("horizontal_count", 0), pair_count=data.get("pair_count", 0), ) def load_model(self, model: str, quantization: str = "fp16") -> None: """Request the server to load a model into VRAM.""" self.session.post( f"{self.base_url}/models/load", json={"model": model, "quantization": quantization}, timeout=self.timeout, ).raise_for_status() def unload_model(self, model: str) -> None: """Request the server to unload a model from VRAM.""" self.session.post( f"{self.base_url}/models/unload", json={"model": model}, timeout=self.timeout, ).raise_for_status()