phase 6
16
gpu/.env.template
Normal file
@@ -0,0 +1,16 @@
# Inference server configuration
# Keep comments on their own lines: docker --env-file does not strip inline comments.
HOST=0.0.0.0
PORT=8000

# VRAM management
VRAM_BUDGET_MB=10240
# Strategy: sequential | concurrent | auto
STRATEGY=sequential

# Model defaults
YOLO_MODEL=yolov8n.pt
YOLO_CONFIDENCE=0.3

# Device: auto | cpu | cuda | cuda:0
DEVICE=auto
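The server reads these values verbatim and never validates them; a typo in STRATEGY or DEVICE only surfaces at runtime. A minimal pre-flight check one could script (the check_env.py name and helper are assumptions for illustration, not part of this commit):

# check_env.py (hypothetical helper, not part of this commit)
# Warns when env values fall outside the choices documented above.
import os

ALLOWED_STRATEGIES = {"sequential", "concurrent", "auto"}

def check_env() -> list[str]:
    errors = []
    strategy = os.environ.get("STRATEGY", "sequential")
    if strategy not in ALLOWED_STRATEGIES:
        errors.append(f"STRATEGY={strategy!r} not in {sorted(ALLOWED_STRATEGIES)}")
    device = os.environ.get("DEVICE", "auto")
    if device not in {"auto", "cpu", "cuda"} and not device.startswith("cuda:"):
        errors.append(f"DEVICE={device!r} is not auto/cpu/cuda/cuda:N")
    for name in ("PORT", "VRAM_BUDGET_MB"):
        value = os.environ.get(name, "0")
        if not value.isdigit():
            errors.append(f"{name}={value!r} is not an integer")
    return errors

if __name__ == "__main__":
    for problem in check_env():
        print("WARN:", problem)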
18
gpu/Dockerfile
Normal file
@@ -0,0 +1,18 @@
FROM python:3.11-slim

RUN pip install --no-cache-dir uv

RUN apt-get update && apt-get install -y \
    libgl1 libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

COPY requirements.txt .
RUN uv pip install --system --no-cache -r requirements.txt

COPY . .

EXPOSE 8000

CMD ["python", "server.py"]
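The image declares no HEALTHCHECK, so readiness has to be probed from outside. A stdlib-only smoke test against the server's /health endpoint, as a sketch (the smoke_test.py name and localhost URL are assumptions):

# smoke_test.py (hypothetical helper, not part of this commit)
# Polls /health until the container answers or the timeout expires.
import json
import time
import urllib.request

def wait_healthy(url: str = "http://localhost:8000/health", timeout: float = 60.0) -> dict:
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=2) as resp:
                return json.load(resp)
        except OSError:
            time.sleep(1)  # not up yet; URLError/HTTPError are OSError subclasses
    raise TimeoutError(f"{url} not healthy after {timeout}s")

if __name__ == "__main__":
    print(wait_healthy())  # e.g. {"status": "ok", "device": "cpu", ...}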
4
gpu/requirements.txt
Normal file
@@ -0,0 +1,4 @@
fastapi>=0.109.0
uvicorn[standard]>=0.27.0
ultralytics>=8.0.0
Pillow>=10.0.0
59
gpu/run.sh
Executable file
@@ -0,0 +1,59 @@
#!/bin/bash
# Run the inference server
#
# Usage:
#   ./run.sh             # Local (pip install -r requirements.txt first)
#   ./run.sh docker      # Docker (CPU)
#   ./run.sh docker-gpu  # Docker with GPU
#   ./run.sh stop        # Stop Docker container

set -e
cd "$(dirname "${BASH_SOURCE[0]}")"

# Load env (create from template if missing)
if [ ! -f .env ]; then
    if [ -f .env.template ]; then
        cp .env.template .env
        echo "Created .env from template — edit as needed"
    fi
fi

# set -a exports everything sourced from .env so child processes inherit it
if [ -f .env ]; then
    set -a
    source .env
    set +a
fi

case "${1:-local}" in
    local)
        python server.py
        ;;
    docker)
        docker build -t mpr-inference .
        ENV_FLAG=""; [ -f .env ] && ENV_FLAG="--env-file .env"
        # PORT in .env selects the host port; inside the container the
        # server must keep listening on 8000 to match the -p mapping.
        docker run --rm -p "${PORT:-8000}:8000" \
            $ENV_FLAG \
            -e PORT=8000 \
            --name mpr-inference \
            mpr-inference
        ;;
    docker-gpu)
        docker build -t mpr-inference .
        ENV_FLAG=""; [ -f .env ] && ENV_FLAG="--env-file .env"
        docker run --rm --gpus all -p "${PORT:-8000}:8000" \
            $ENV_FLAG \
            -e PORT=8000 \
            --name mpr-inference \
            mpr-inference
        ;;
    stop)
        docker stop mpr-inference 2>/dev/null || true
        ;;
    *)
        echo "Usage: ./run.sh [local|docker|docker-gpu|stop]"
        exit 1
        ;;
esac
209
gpu/server.py
Normal file
@@ -0,0 +1,209 @@
"""
Inference server — thin HTTP wrapper around ML models.

Runs on the GPU machine. The detection pipeline calls this over HTTP,
or imports the same logic locally if the GPU is on the same machine.

Config is loaded from env on startup, then editable at runtime via
GET/PUT /config. The UI config panel is just a visual editor for these
same values.

Usage:
    cd gpu && uvicorn server:app --host 0.0.0.0 --port 8000
    # or
    cd gpu && python server.py
"""

from __future__ import annotations

import base64
import io
import logging
import os
from contextlib import asynccontextmanager

import numpy as np
from fastapi import FastAPI, HTTPException
from PIL import Image
from pydantic import BaseModel

logger = logging.getLogger(__name__)

# --- Runtime config (loaded from env, mutable via API) ---
_config = {
    "device": os.environ.get("DEVICE", "auto"),
    "yolo_model": os.environ.get("YOLO_MODEL", "yolov8n.pt"),
    "yolo_confidence": float(os.environ.get("YOLO_CONFIDENCE", "0.3")),
    "vram_budget_mb": int(os.environ.get("VRAM_BUDGET_MB", "10240")),
    "strategy": os.environ.get("STRATEGY", "sequential"),
}

# --- Model registry ---
_models: dict[str, object] = {}


# --- Helpers ---

def _get_device() -> str:
    device = _config["device"]
    if device != "auto":
        return device
    try:
        import torch
        return "cuda" if torch.cuda.is_available() else "cpu"
    except ImportError:
        return "cpu"


def _get_yolo(model_name: str | None = None):
    name = model_name or _config["yolo_model"]
    if name not in _models:
        from ultralytics import YOLO
        device = _get_device()
        logger.info("Loading %s on %s", name, device)
        model = YOLO(name)
        model.to(device)
        _models[name] = model
    return _models[name]


def _decode_image(b64: str) -> np.ndarray:
    data = base64.b64decode(b64)
    img = Image.open(io.BytesIO(data)).convert("RGB")
    return np.array(img)


# --- Request/Response models ---

class DetectRequest(BaseModel):
    image: str  # base64 JPEG
    model: str | None = None  # defaults to config yolo_model
    confidence: float | None = None  # defaults to config yolo_confidence
    target_classes: list[str] | None = None


class BBox(BaseModel):
    x: int
    y: int
    w: int
    h: int
    confidence: float
    label: str


class DetectResponse(BaseModel):
    detections: list[BBox]


class ConfigUpdate(BaseModel):
    """Partial config update — only provided fields are changed."""
    device: str | None = None
    yolo_model: str | None = None
    yolo_confidence: float | None = None
    vram_budget_mb: int | None = None
    strategy: str | None = None


# --- App ---

@asynccontextmanager
async def lifespan(app: FastAPI):
    logger.info("Inference server starting (device=%s)", _get_device())
    yield
    logger.info("Inference server shutting down")
    _models.clear()


app = FastAPI(title="MPR Inference Server", lifespan=lifespan)


@app.get("/health")
def health():
    return {
        "status": "ok",
        "device": _get_device(),
        "loaded_models": list(_models.keys()),
        "vram_budget_mb": _config["vram_budget_mb"],
        "strategy": _config["strategy"],
    }


@app.get("/config")
def get_config():
    """Current runtime config. Same values the .env sets at startup."""
    return {**_config, "device_resolved": _get_device()}


@app.put("/config")
def update_config(update: ConfigUpdate):
    """Update runtime config. Only provided fields are changed."""
    changes = update.model_dump(exclude_none=True)
    if not changes:
        return _config

    # If the model changed, unload the old one so the new one is loaded lazily
    if "yolo_model" in changes and changes["yolo_model"] != _config["yolo_model"]:
        old = _config["yolo_model"]
        if old in _models:
            del _models[old]
            logger.info("Unloaded %s (model changed)", old)

    _config.update(changes)
    logger.info("Config updated: %s", changes)
    return {**_config, "device_resolved": _get_device()}


@app.post("/models/unload")
def unload_model(body: dict):
    """Unload a model from memory to free VRAM."""
    name = body.get("model", "")
    if name in _models:
        del _models[name]
        logger.info("Unloaded %s", name)
        return {"status": "unloaded", "model": name}
    return {"status": "not_loaded", "model": name}


@app.post("/detect", response_model=DetectResponse)
def detect(req: DetectRequest):
    model_name = req.model or _config["yolo_model"]
    confidence = req.confidence if req.confidence is not None else _config["yolo_confidence"]

    try:
        model = _get_yolo(model_name)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to load model: {e}") from e

    # Bad base64 or a non-image payload is a client error, not a server error
    try:
        image = _decode_image(req.image)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid image data: {e}") from e
    results = model(image, conf=confidence, verbose=False)

    detections = []
    for r in results:
        for box in r.boxes:
            x1, y1, x2, y2 = box.xyxy[0].tolist()
            label = r.names[int(box.cls[0])]

            if req.target_classes and label not in req.target_classes:
                continue

            det = BBox(
                x=int(x1), y=int(y1),
                w=int(x2 - x1), h=int(y2 - y1),
                confidence=float(box.conf[0]),
                label=label,
            )
            detections.append(det)

    return DetectResponse(detections=detections)


if __name__ == "__main__":
    import uvicorn

    logging.basicConfig(level=logging.INFO, format="%(levelname)-7s %(name)s — %(message)s")
    host = os.environ.get("HOST", "0.0.0.0")
    port = int(os.environ.get("PORT", "8000"))
    uvicorn.run(app, host=host, port=port)
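For reference, a minimal client for the /detect endpoint, matching the DetectRequest/DetectResponse schemas above; a stdlib-only sketch (the URL and frame.jpg path are assumptions for illustration, not part of this commit):

# detect client (hypothetical, not part of this commit)
import base64
import json
import urllib.request

def detect(path: str, url: str = "http://localhost:8000/detect") -> list[dict]:
    with open(path, "rb") as f:
        payload = {
            "image": base64.b64encode(f.read()).decode("ascii"),
            "confidence": 0.5,             # overrides config yolo_confidence
            "target_classes": ["person"],  # optional label filter
        }
    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)["detections"]

if __name__ == "__main__":
    for d in detect("frame.jpg"):
        print(d["label"], d["confidence"], (d["x"], d["y"], d["w"], d["h"]))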
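The runtime-config loop the docstring describes works the same way: GET /config to read, PUT /config with only the fields to change. A sketch under the same assumed host:

# config update (hypothetical, not part of this commit)
import json
import urllib.request

BASE = "http://localhost:8000"  # assumed host

def put_config(**changes) -> dict:
    req = urllib.request.Request(
        f"{BASE}/config",
        data=json.dumps(changes).encode(),
        headers={"Content-Type": "application/json"},
        method="PUT",
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)

# Switching yolo_model unloads the old weights; the new model is loaded
# lazily on the next /detect request.
print(put_config(yolo_model="yolov8s.pt", yolo_confidence=0.4))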