phase cv 0

This commit is contained in:
2026-03-26 22:22:35 -03:00
parent beb0416280
commit 65814b5b9e
46 changed files with 2962 additions and 268 deletions

View File

@@ -40,6 +40,11 @@
"target": "typescript",
"output": "ui/detection-app/src/types/store-state.ts",
"include": ["ui_state_views"]
},
{
"target": "pydantic",
"output": "gpu/models/inference_contract.py",
"include": ["inference_views"]
}
]
}

View File

@@ -33,6 +33,7 @@ from .detect_jobs import (
from .media import AssetStatus, MediaAsset
from .presets import BUILTIN_PRESETS, TranscodePreset
from .detect import DETECT_VIEWS # noqa: F401 — discovered by modelgen generic loader
from .inference import INFERENCE_VIEWS # noqa: F401 — GPU inference server API types
from .ui_state import UI_STATE_VIEWS # noqa: F401 — UI store state types
from .views import ChunkEvent, ChunkOutputFile, PipelineStats, WorkerEvent
from .sources import ChunkInfo, SourceJob, SourceType

View File

@@ -53,6 +53,7 @@ class BoundingBoxEvent:
label: str
resolved_brand: Optional[str] = None
source: Optional[str] = None
stage: Optional[str] = None
@dataclass
@@ -85,6 +86,7 @@ class StatsUpdate:
frames_extracted: int = 0
frames_after_scene_filter: int = 0
cv_regions_detected: int = 0
regions_detected: int = 0
regions_resolved_by_ocr: int = 0
regions_escalated_to_local_vlm: int = 0
@@ -166,6 +168,8 @@ class CheckpointInfo:
"""Available checkpoint for a stage."""
stage: str
is_scenario: bool = False
scenario_label: str = ""
@dataclass

View File

@@ -93,13 +93,12 @@ class StageCheckpoint:
frames_meta: List[Dict[str, Any]] = field(default_factory=list) # sequence, chunk_id, timestamp, hash
filtered_frame_sequences: List[int] = field(default_factory=list)
# Detection state (full structured data, not just summaries)
boxes_by_frame: Dict[str, List[Dict[str, Any]]] = field(default_factory=dict)
text_candidates: List[Dict[str, Any]] = field(default_factory=list)
unresolved_candidates: List[Dict[str, Any]] = field(default_factory=list)
detections: List[Dict[str, Any]] = field(default_factory=list)
# Stage output — stored as blob in MinIO: checkpoints/{job_id}/stages/{stage}.bson
# Each stage's serialize_fn/deserialize_fn owns the format.
# Postgres only stores the S3 key, not the data itself.
stage_output_key: str = "" # s3 key to the serialized stage output
# Pipeline state
# Pipeline state (small, stays in Postgres)
stats: Dict[str, Any] = field(default_factory=dict)
config_snapshot: Dict[str, Any] = field(default_factory=dict)
config_overrides: Dict[str, Any] = field(default_factory=dict)
@@ -108,6 +107,13 @@ class StageCheckpoint:
video_path: str = ""
profile_name: str = ""
# Scenario — a checkpoint bookmarked for the editor workflow.
# Created by seeders (manual scripts that populate state from real footage)
# or captured from a running pipeline. Loaded via URL:
# /detection/?job=<job_id>&stage=<stage>&editor=true
is_scenario: bool = False
scenario_label: str = "" # human-readable name, e.g. "chelsea_edges_lowcanny"
# Timestamps
created_at: Optional[datetime] = None

View File

@@ -70,6 +70,7 @@ class BrandStats:
class PipelineStats:
frames_extracted: int = 0
frames_after_scene_filter: int = 0
cv_regions_detected: int = 0
regions_detected: int = 0
regions_resolved_by_ocr: int = 0
regions_escalated_to_local_vlm: int = 0

View File

@@ -0,0 +1,197 @@
"""
Inference Server API Schema Definitions
Source of truth for GPU inference server request/response types.
Generates: Pydantic (gpu/models/inference_contract.py)
These are the wire-format types for the HTTP API between the
pipeline (detect/) and the inference server (gpu/).
"""
from dataclasses import dataclass, field
from typing import List, Optional
# --- Object Detection (YOLO) ---
@dataclass
class DetectRequest:
    """Request body for object detection.

    Wire-format type for the HTTP API between the pipeline and the
    GPU inference server.
    """
    image: str  # base64 JPEG
    model: Optional[str] = None  # model identifier; None presumably selects the server default — confirm
    confidence: Optional[float] = None  # detection threshold override; None presumably uses the server default — confirm
    target_classes: Optional[List[str]] = None  # restrict detections to these class names; None = no restriction (presumably)
@dataclass
class BBox:
    """A detected bounding box.

    x/y/w/h convention: top-left corner plus width/height —
    presumably in pixel coordinates of the submitted image; confirm
    against the detector implementation.
    """
    x: int
    y: int
    w: int
    h: int
    confidence: float  # detector confidence for this box
    label: str  # predicted class name
@dataclass
class DetectResponse:
    """Response from object detection.

    An empty `detections` list is a valid result (nothing found),
    not an error condition.
    """
    detections: List[BBox] = field(default_factory=list)
# --- OCR ---
@dataclass
class OCRRequest:
    """Request body for OCR."""
    image: str  # base64 JPEG
    languages: Optional[List[str]] = None  # language hints; None presumably falls back to server-configured languages — confirm
@dataclass
class OCRTextResult:
    """A single OCR text extraction result."""
    text: str  # extracted text content
    confidence: float  # OCR engine confidence for this extraction
    bbox: List[int] = field(default_factory=list)  # [x, y, w, h]; empty when no box is available
@dataclass
class OCRResponse:
    """Response from OCR.

    An empty `results` list means no text was extracted.
    """
    results: List[OCRTextResult] = field(default_factory=list)
# --- Preprocessing ---
@dataclass
class PreprocessRequest:
    """Request body for image preprocessing.

    Each flag toggles one preprocessing step; note that only
    `contrast` is enabled by default.
    """
    image: str  # base64 JPEG
    binarize: bool = False  # apply binarization (off by default)
    deskew: bool = False  # apply deskewing (off by default)
    contrast: bool = True  # apply contrast adjustment (on by default)
@dataclass
class PreprocessResponse:
    """Response from preprocessing."""
    image: str  # base64 JPEG of processed image
# --- VLM ---
@dataclass
class VLMRequest:
    """Request body for visual language model query."""
    image: str  # base64 JPEG
    prompt: str  # text prompt sent to the VLM alongside the image
    model: Optional[str] = None  # VLM model identifier; None presumably selects the server default — confirm
@dataclass
class VLMResponse:
    """Response from VLM.

    All fields are required — the server is expected to always return a
    brand verdict with its confidence and reasoning text.
    """
    brand: str  # brand name identified by the model
    confidence: float  # model's confidence in the brand verdict
    reasoning: str  # free-text explanation from the model
# --- CV Region Analysis ---
@dataclass
class AnalyzeRegionsRequest:
    """Request body for CV region analysis.

    Tunable parameters for the edge-based region proposal pass.
    Distances/lengths are presumably in pixels of the submitted image —
    confirm against the analysis implementation.
    """
    image: str  # base64 JPEG
    # Edge detection (Canny + HoughLinesP)
    edge_canny_low: int = 50  # Canny lower hysteresis threshold
    edge_canny_high: int = 150  # Canny upper hysteresis threshold
    edge_hough_threshold: int = 80  # HoughLinesP accumulator threshold (votes)
    edge_hough_min_length: int = 100  # minimum accepted line length
    edge_hough_max_gap: int = 10  # max gap allowed when joining line segments
    edge_pair_max_distance: int = 200  # max separation for two lines to be paired
    edge_pair_min_distance: int = 15  # min separation for two lines to be paired
@dataclass
class RegionBox:
    """A candidate region from CV analysis.

    Same x/y/w/h layout as BBox, but kept as a distinct wire type so the
    CV-proposal contract can evolve independently of YOLO detections.
    """
    x: int
    y: int
    w: int
    h: int
    confidence: float  # heuristic confidence assigned by the CV analysis
    label: str  # region label assigned by the CV analysis
@dataclass
class AnalyzeRegionsResponse:
    """Response from CV region analysis.

    An empty `regions` list means no candidate regions were proposed.
    """
    regions: List[RegionBox] = field(default_factory=list)
@dataclass
class AnalyzeRegionsDebugResponse:
    """Response from CV region analysis with debug overlays.

    Superset of AnalyzeRegionsResponse: adds intermediate visualizations
    and counters for tuning the edge/Hough parameters. Overlay fields are
    empty strings when no overlay was produced.
    """
    regions: List[RegionBox] = field(default_factory=list)
    edge_overlay_b64: str = ""  # Canny edge image as base64 JPEG
    lines_overlay_b64: str = ""  # frame with Hough lines drawn
    horizontal_count: int = 0  # number of horizontal lines found — presumably pre-pairing; confirm
    pair_count: int = 0  # number of line pairs formed
# --- Server Config ---
@dataclass
class ConfigUpdate:
    """Request body for updating server configuration.

    Every field is optional — presumably a partial (PATCH-style) update
    where None means "leave unchanged"; confirm against the server handler.
    """
    device: Optional[str] = None  # compute device selection
    yolo_model: Optional[str] = None  # object-detection model identifier
    yolo_confidence: Optional[float] = None  # default detection threshold
    vram_budget_mb: Optional[int] = None  # GPU memory budget, in megabytes
    strategy: Optional[str] = None  # model-management strategy name — semantics defined server-side
    ocr_languages: Optional[List[str]] = None  # default OCR language set
    ocr_min_confidence: Optional[float] = None  # minimum confidence for OCR results
# --- Export list for modelgen ---
# Every class listed here is picked up by the modelgen loader (this module
# is imported for its side effect elsewhere in the package) and emitted to
# the generated Pydantic contract. Add new wire types here or they will be
# silently missing from the generated code.
INFERENCE_VIEWS: List[type] = [
    DetectRequest,
    BBox,
    DetectResponse,
    OCRRequest,
    OCRTextResult,
    OCRResponse,
    PreprocessRequest,
    PreprocessResponse,
    VLMRequest,
    VLMResponse,
    AnalyzeRegionsRequest,
    RegionBox,
    AnalyzeRegionsResponse,
    AnalyzeRegionsDebugResponse,
    ConfigUpdate,
]