schema clean up and refactor

2026-03-26 05:14:33 -03:00
parent 08c58a6a9d
commit d58a90157a
17 changed files with 930 additions and 287 deletions
--- a/core/schema/models/detect_pipeline.py
+++ b/core/schema/models/detect_pipeline.py
@@ -0,0 +1,97 @@
+"""
+Detection pipeline runtime models.
+
+These are the data structures that flow between LangGraph nodes.
+They contain runtime types (np.ndarray) so they are NOT generated
+by modelgen — they live here for the schema to be the complete
+map of the application, but modelgen skips them.
+
+Wire-format models (SSE events) are in detect.py.
+DB models (jobs, checkpoints) are in detect_jobs.py.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Literal
+
+import numpy as np
+
+
+@dataclass
+class Frame:  # one frame record passed between pipeline nodes; carries its image as an np.ndarray
+    sequence: int  # NOTE(review): presumably the frame's ordinal in extraction order — confirm
+    chunk_id: int  # NOTE(review): presumably identifies the video chunk this frame came from — confirm
+    timestamp: float  # position in video (seconds)
+    image: np.ndarray  # runtime-only array type; shape/dtype are not specified in this module
+    perceptual_hash: str = ""  # defaults to the empty string; hash algorithm is not specified here
+
+
+@dataclass
+class BoundingBox:  # region described by x/y/w/h plus a confidence score and a label
+    x: int  # NOTE(review): presumably the left edge in pixels — confirm origin and units
+    y: int  # NOTE(review): presumably the top edge in pixels — confirm origin and units
+    w: int  # NOTE(review): presumably width, in the same units as x — confirm
+    h: int  # NOTE(review): presumably height, in the same units as y — confirm
+    confidence: float  # range is not enforced here; presumably 0..1 — TODO confirm
+    label: str  # free-form string; the set of valid labels is not defined in this module
+
+
+@dataclass
+class TextCandidate:  # a text string tied to one BoundingBox on one Frame
+    frame: Frame  # the full Frame object (including its np.ndarray image), not an index
+    bbox: BoundingBox  # NOTE(review): presumably the region the text was read from — confirm
+    text: str
+    ocr_confidence: float  # NOTE(review): presumably the OCR score for `text`; range not enforced — confirm
+
+
+@dataclass
+class BrandDetection:  # element type of DetectionReport.timeline
+    brand: str
+    timestamp: float  # NOTE(review): presumably seconds into the video, like Frame.timestamp — confirm
+    duration: float  # NOTE(review): presumably seconds — confirm
+    confidence: float  # range is not enforced here
+    source: Literal["ocr", "local_vlm", "cloud_llm", "logo_match", "auxiliary"]  # type-hint only; the dataclass does not validate it at runtime
+    bbox: BoundingBox | None = None  # optional; None when no region is attached
+    frame_ref: int | None = None  # NOTE(review): looks like a Frame.sequence value — confirm
+    content_type: str = ""  # defaults to the empty string
+
+
+@dataclass
+class BrandStats:  # value type of DetectionReport.brands; every field defaults to zero
+    total_appearances: int = 0
+    total_screen_time: float = 0.0  # NOTE(review): presumably seconds — confirm
+    avg_confidence: float = 0.0
+    first_seen: float = 0.0  # NOTE(review): presumably a video timestamp in seconds; 0.0 is also the default — confirm
+    last_seen: float = 0.0  # NOTE(review): presumably a video timestamp in seconds — confirm
+
+
+@dataclass
+class PipelineStats:  # run-level counters plus timing/cost totals; every field defaults to zero
+    frames_extracted: int = 0
+    frames_after_scene_filter: int = 0
+    regions_detected: int = 0
+    regions_resolved_by_ocr: int = 0
+    regions_escalated_to_local_vlm: int = 0
+    regions_escalated_to_cloud_llm: int = 0
+    auxiliary_detections: int = 0
+    cloud_llm_calls: int = 0
+    processing_time_seconds: float = 0.0  # seconds, per the field name
+    estimated_cloud_cost_usd: float = 0.0  # US dollars, per the field name
+
+
+@dataclass
+class DetectionReport:  # top-level result: three required fields plus containers that default to empty
+    video_source: str  # NOTE(review): path vs. URL is not specified here — confirm with callers
+    content_type: str
+    duration_seconds: float  # seconds, per the field name
+    brands: dict[str, BrandStats] = field(default_factory=dict)  # NOTE(review): presumably keyed by brand name — confirm
+    timeline: list[BrandDetection] = field(default_factory=list)  # default_factory gives each instance its own list
+    pipeline_stats: PipelineStats = field(default_factory=PipelineStats)  # each instance gets its own all-zero PipelineStats
+
+
+# Not in DATACLASSES — modelgen skips these (Frame holds an np.ndarray; TextCandidate embeds a Frame)
+RUNTIME_MODELS = [
+    Frame, BoundingBox, TextCandidate, BrandDetection,  # listed in definition order
+    BrandStats, PipelineStats, DetectionReport,
+]