2026-03-30 07:22:14 -03:00
parent d0707333fd
commit 4220b0418e
182 changed files with 3668 additions and 5231 deletions


@@ -0,0 +1,4 @@
from .client import InferenceClient
from .types import DetectResult, OCRResult, VLMResult
__all__ = ["InferenceClient", "DetectResult", "OCRResult", "VLMResult"]
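For reference, a consumer-side import sketch against this public surface; the package's installable name is not shown in the diff, so inference_client below is a placeholder.

# Placeholder top-level name; the diff shows only relative imports.
from inference_client import InferenceClient, DetectResult, OCRResult, VLMResult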


@@ -0,0 +1,262 @@
"""
HTTP client for the inference server.
The pipeline stages call this instead of importing ML libraries directly.
The inference server runs on the GPU machine (or a spot instance).
"""
from __future__ import annotations
import base64
import io
import logging
import os
import numpy as np
import requests
from PIL import Image
from .types import DetectResult, ModelInfo, OCRResult, RegionDebugResult, RegionResult, ServerStatus, VLMResult
logger = logging.getLogger(__name__)
DEFAULT_URL = os.environ.get("INFERENCE_URL", "http://localhost:8000")
def _encode_image(image: np.ndarray) -> str:
    """Encode a numpy array as a base64 JPEG string."""
    img = Image.fromarray(image)
    if img.mode != "RGB":
        # JPEG cannot carry alpha or palette modes; normalize first.
        img = img.convert("RGB")
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=85)  # lossy but compact on the wire
    return base64.b64encode(buf.getvalue()).decode()
class InferenceClient:
"""HTTP client for the GPU inference server."""
def __init__(self, base_url: str | None = None, timeout: float = 60.0,
job_id: str = "", log_level: str = "INFO"):
self.base_url = (base_url or DEFAULT_URL).rstrip("/")
self.timeout = timeout
self.job_id = job_id
self.log_level = log_level
self.session = requests.Session()
if job_id:
self.session.headers["X-Job-Id"] = job_id
self.session.headers["X-Log-Level"] = log_level
def health(self) -> ServerStatus:
"""Check server health and loaded models."""
resp = self.session.get(f"{self.base_url}/health", timeout=self.timeout)
resp.raise_for_status()
        data = resp.json()
        return ServerStatus(
            # Entries are assumed to mirror ModelInfo's fields.
            loaded_models=[
                ModelInfo(name=m.get("name", ""), vram_mb=m.get("vram_mb", 0.0),
                          quantization=m.get("quantization", "fp16"))
                for m in data.get("loaded_models", [])
            ],
            vram_used_mb=data.get("vram_used_mb", 0.0),
            vram_budget_mb=data.get("vram_budget_mb", 0.0),
            strategy=data.get("strategy", "sequential"),
        )
def detect(
self,
image: np.ndarray,
model: str = "yolov8n",
confidence: float = 0.3,
target_classes: list[str] | None = None,
) -> list[DetectResult]:
"""Run object detection on an image."""
payload = {
"image": _encode_image(image),
"model": model,
"confidence": confidence,
}
if target_classes:
payload["target_classes"] = target_classes
resp = self.session.post(
f"{self.base_url}/detect",
json=payload,
timeout=self.timeout,
)
resp.raise_for_status()
results = []
for d in resp.json().get("detections", []):
result = DetectResult(
x=d["x"], y=d["y"], w=d["w"], h=d["h"],
confidence=d["confidence"], label=d["label"],
)
results.append(result)
return results
def ocr(
self,
image: np.ndarray,
languages: list[str] | None = None,
) -> list[OCRResult]:
"""Run OCR on an image region."""
payload = {
"image": _encode_image(image),
}
if languages:
payload["languages"] = languages
resp = self.session.post(
f"{self.base_url}/ocr",
json=payload,
timeout=self.timeout,
)
resp.raise_for_status()
results = []
for d in resp.json().get("results", []):
result = OCRResult(
text=d["text"],
confidence=d["confidence"],
bbox=tuple(d["bbox"]),
)
results.append(result)
return results
def vlm(
self,
image: np.ndarray,
prompt: str,
model: str = "moondream2",
) -> VLMResult:
"""Query a visual language model with an image crop + prompt."""
payload = {
"image": _encode_image(image),
"prompt": prompt,
"model": model,
}
resp = self.session.post(
f"{self.base_url}/vlm",
json=payload,
timeout=self.timeout,
)
resp.raise_for_status()
data = resp.json()
return VLMResult(
brand=data.get("brand", ""),
confidence=data.get("confidence", 0.0),
reasoning=data.get("reasoning", ""),
)
def detect_edges(
self,
image: np.ndarray,
edge_canny_low: int = 50,
edge_canny_high: int = 150,
edge_hough_threshold: int = 80,
edge_hough_min_length: int = 100,
edge_hough_max_gap: int = 10,
edge_pair_max_distance: int = 200,
edge_pair_min_distance: int = 15,
) -> list[RegionResult]:
"""Run edge detection on an image."""
payload = {
"image": _encode_image(image),
"edge_canny_low": edge_canny_low,
"edge_canny_high": edge_canny_high,
"edge_hough_threshold": edge_hough_threshold,
"edge_hough_min_length": edge_hough_min_length,
"edge_hough_max_gap": edge_hough_max_gap,
"edge_pair_max_distance": edge_pair_max_distance,
"edge_pair_min_distance": edge_pair_min_distance,
}
resp = self.session.post(
f"{self.base_url}/detect_edges",
json=payload,
timeout=self.timeout,
)
resp.raise_for_status()
results = []
for r in resp.json().get("regions", []):
result = RegionResult(
x=r["x"], y=r["y"], w=r["w"], h=r["h"],
confidence=r["confidence"], label=r["label"],
)
results.append(result)
return results
def detect_edges_debug(
self,
image: np.ndarray,
edge_canny_low: int = 50,
edge_canny_high: int = 150,
edge_hough_threshold: int = 80,
edge_hough_min_length: int = 100,
edge_hough_max_gap: int = 10,
edge_pair_max_distance: int = 200,
edge_pair_min_distance: int = 15,
) -> RegionDebugResult:
"""Run edge detection with debug overlays."""
payload = {
"image": _encode_image(image),
"edge_canny_low": edge_canny_low,
"edge_canny_high": edge_canny_high,
"edge_hough_threshold": edge_hough_threshold,
"edge_hough_min_length": edge_hough_min_length,
"edge_hough_max_gap": edge_hough_max_gap,
"edge_pair_max_distance": edge_pair_max_distance,
"edge_pair_min_distance": edge_pair_min_distance,
}
resp = self.session.post(
f"{self.base_url}/detect_edges/debug",
json=payload,
timeout=self.timeout,
)
resp.raise_for_status()
data = resp.json()
regions = []
for r in data.get("regions", []):
region = RegionResult(
x=r["x"], y=r["y"], w=r["w"], h=r["h"],
confidence=r["confidence"], label=r["label"],
)
regions.append(region)
return RegionDebugResult(
regions=regions,
edge_overlay_b64=data.get("edge_overlay_b64", ""),
lines_overlay_b64=data.get("lines_overlay_b64", ""),
horizontal_count=data.get("horizontal_count", 0),
pair_count=data.get("pair_count", 0),
)
def post(self, path: str, payload: dict) -> dict | None:
"""Generic POST to the inference server. Returns JSON response or None on error."""
try:
resp = self.session.post(
f"{self.base_url}{path}",
json=payload,
timeout=self.timeout,
)
resp.raise_for_status()
return resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decode failures on older requests versions.
            logger.warning("Inference POST %s failed: %s", path, e)
            return None
def load_model(self, model: str, quantization: str = "fp16") -> None:
"""Request the server to load a model into VRAM."""
self.session.post(
f"{self.base_url}/models/load",
json={"model": model, "quantization": quantization},
timeout=self.timeout,
).raise_for_status()
def unload_model(self, model: str) -> None:
"""Request the server to unload a model from VRAM."""
self.session.post(
f"{self.base_url}/models/unload",
json={"model": model},
timeout=self.timeout,
).raise_for_status()
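For orientation, a minimal usage sketch of the client from a pipeline stage. This is not part of the commit: the import path inference_client, the file frame.jpg, the job id, and the prompt are placeholders.

import numpy as np
from PIL import Image
from inference_client import InferenceClient  # placeholder import path

client = InferenceClient(job_id="job-123", timeout=120.0)
status = client.health()
print(f"VRAM {status.vram_used_mb:.0f}/{status.vram_budget_mb:.0f} MB, strategy={status.strategy}")

frame = np.asarray(Image.open("frame.jpg").convert("RGB"))

# Detect objects, then OCR each detected crop.
detections = client.detect(frame, confidence=0.4)
for det in detections:
    crop = frame[det.y:det.y + det.h, det.x:det.x + det.w]
    for line in client.ocr(crop, languages=["en"]):
        print(det.label, repr(line.text), line.confidence)

# Explicit load/unload keeps VRAM free between heavy passes.
if detections:
    d = detections[0]
    client.load_model("moondream2", quantization="int8")
    answer = client.vlm(frame[d.y:d.y + d.h, d.x:d.x + d.w],
                        prompt="What brand is on the packaging?")
    client.unload_model("moondream2")
    print(answer.brand, answer.confidence, answer.reasoning)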


@@ -0,0 +1,76 @@
"""
Inference response types.
These are the shapes returned by the inference server.
Kept separate from core.detect.models to avoid coupling the
inference protocol to pipeline internals.
"""
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass
class DetectResult:
"""Single object detection from YOLO or similar."""
x: int
y: int
w: int
h: int
confidence: float
label: str
@dataclass
class OCRResult:
"""Text extracted from a region."""
text: str
confidence: float
bbox: tuple[int, int, int, int] # x, y, w, h
@dataclass
class VLMResult:
"""Visual language model response for a crop."""
brand: str
confidence: float
reasoning: str
@dataclass
class RegionResult:
"""A candidate region from CV analysis."""
x: int
y: int
w: int
h: int
confidence: float
label: str
@dataclass
class RegionDebugResult:
"""CV region analysis with debug overlays."""
regions: list[RegionResult] = field(default_factory=list)
edge_overlay_b64: str = ""
lines_overlay_b64: str = ""
horizontal_count: int = 0
pair_count: int = 0
@dataclass
class ModelInfo:
"""Info about a loaded model."""
name: str
vram_mb: float
quantization: str # fp32, fp16, int8, int4
@dataclass
class ServerStatus:
"""Inference server health response."""
loaded_models: list[ModelInfo] = field(default_factory=list)
vram_used_mb: float = 0.0
vram_budget_mb: float = 0.0
strategy: str = "sequential" # sequential, concurrent, auto
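As a sketch of how these types compose, a hypothetical helper that checks the remaining VRAM budget before a caller asks the server to load another model. can_load and the sizes are illustrative, not part of this commit; it assumes ServerStatus and ModelInfo from the module above.

def can_load(status: ServerStatus, estimated_vram_mb: float) -> bool:
    """True if the estimated footprint fits in the remaining budget."""
    return estimated_vram_mb <= status.vram_budget_mb - status.vram_used_mb

status = ServerStatus(
    loaded_models=[ModelInfo(name="yolov8n", vram_mb=640.0, quantization="fp16")],
    vram_used_mb=640.0,
    vram_budget_mb=8192.0,
)
assert can_load(status, 4096.0)      # 7552 MB free, fits
assert not can_load(status, 9000.0)  # would exceed the budget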