Schema cleanup and refactor

This commit is contained in:
2026-03-26 05:14:33 -03:00
parent 08c58a6a9d
commit d58a90157a
17 changed files with 930 additions and 287 deletions

View File

@@ -0,0 +1,97 @@
"""
Detection pipeline runtime models.
These are the data structures that flow between LangGraph nodes.
They contain runtime types (np.ndarray) so they are NOT generated
by modelgen — they live here for the schema to be the complete
map of the application, but modelgen skips them.
Wire-format models (SSE events) are in detect.py.
DB models (jobs, checkpoints) are in detect_jobs.py.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Literal
import numpy as np
@dataclass
class Frame:
    """A single extracted video frame with its decoded image data.

    Carries an np.ndarray, so instances are never JSON-serialized directly;
    serializers strip the image and store it separately.
    """
    sequence: int  # ordering index of this frame within the job
    chunk_id: int  # id of the video chunk this frame came from — TODO confirm semantics
    timestamp: float  # position in video (seconds)
    image: np.ndarray  # decoded pixel data (runtime only, excluded from wire format)
    perceptual_hash: str = ""  # perceptual hash of the image; "" when not yet computed
@dataclass
class BoundingBox:
    """A labeled rectangular region within a frame (presumably pixel units)."""
    x: int  # left edge
    y: int  # top edge
    w: int  # width
    h: int  # height
    confidence: float  # detector confidence for this region
    label: str  # label assigned by the region detector
@dataclass
class TextCandidate:
    """OCR output for one region of one frame, awaiting brand resolution."""
    frame: Frame  # originating frame (serialized as its sequence number)
    bbox: BoundingBox  # region of the frame the text was read from
    text: str  # recognized text
    ocr_confidence: float  # OCR engine confidence for the recognized text
@dataclass
class BrandDetection:
    """One brand appearance on the detection timeline."""
    brand: str  # detected brand name
    timestamp: float  # when the appearance occurs (seconds into the video)
    duration: float  # how long the brand is visible (seconds) — assumed; confirm with producer
    confidence: float  # confidence of the producing stage
    # Which pipeline stage produced this detection.
    source: Literal["ocr", "local_vlm", "cloud_llm", "logo_match", "auxiliary"]
    bbox: BoundingBox | None = None  # region, when a localized detection exists
    frame_ref: int | None = None  # sequence number of the originating frame, if any
    content_type: str = ""  # content type of the video at detection time — TODO confirm
@dataclass
class BrandStats:
    """Aggregated per-brand metrics across the whole video."""
    total_appearances: int = 0  # count of detections for this brand
    total_screen_time: float = 0.0  # summed visible duration (seconds)
    avg_confidence: float = 0.0  # mean confidence across detections
    first_seen: float = 0.0  # earliest detection timestamp (seconds)
    last_seen: float = 0.0  # latest detection timestamp (seconds)
@dataclass
class PipelineStats:
    """Counters describing how a single pipeline run progressed.

    Tracks the escalation funnel (OCR → local VLM → cloud LLM) plus
    wall-clock time and estimated cloud spend.
    """
    frames_extracted: int = 0  # frames pulled from the source video
    frames_after_scene_filter: int = 0  # frames surviving scene/dedup filtering
    regions_detected: int = 0  # candidate regions found across all frames
    regions_resolved_by_ocr: int = 0  # regions settled at the OCR stage
    regions_escalated_to_local_vlm: int = 0  # regions OCR could not settle
    regions_escalated_to_cloud_llm: int = 0  # regions the local VLM could not settle
    auxiliary_detections: int = 0  # detections from the auxiliary source
    cloud_llm_calls: int = 0  # number of cloud LLM invocations
    processing_time_seconds: float = 0.0  # total wall-clock processing time
    estimated_cloud_cost_usd: float = 0.0  # rough cost estimate for cloud calls
@dataclass
class DetectionReport:
    """Final output of the detection pipeline for one video."""
    video_source: str  # identifier/path/URL of the analyzed video
    content_type: str  # type of content in the video — TODO confirm taxonomy
    duration_seconds: float  # length of the analyzed video
    brands: dict[str, BrandStats] = field(default_factory=dict)  # per-brand aggregates
    timeline: list[BrandDetection] = field(default_factory=list)  # individual detections
    pipeline_stats: PipelineStats = field(default_factory=PipelineStats)  # run counters
# Not in DATACLASSES — modelgen skips these (they contain np.ndarray)
# Exported so the schema stays a complete map of the application even
# though these models are runtime-only.
RUNTIME_MODELS = [
    Frame, BoundingBox, TextCandidate, BrandDetection,
    BrandStats, PipelineStats, DetectionReport,
]

View File

@@ -0,0 +1,11 @@
"""
Model serializers — one module per model group, mirroring core/schema/models/.
models/detect_pipeline.py → serializers/detect_pipeline.py
models/detect_jobs.py → serializers/detect_jobs.py
models/detect.py → serializers/detect.py (SSE events)
Common utilities in _common.py.
"""
from ._common import safe_construct, serialize_dataclass, serialize_dataclass_list

View File

@@ -0,0 +1,38 @@
"""Common serialization utilities."""
from __future__ import annotations
import dataclasses
import logging
logger = logging.getLogger(__name__)
def safe_construct(cls, data: dict):
    """Build ``cls(**data)`` while tolerating schema drift.

    Keys absent from the dataclass are dropped (logged at debug level, since
    the field was likely removed); keys absent from ``data`` fall back to the
    dataclass defaults (the field was likely added).
    """
    valid = {f.name for f in dataclasses.fields(cls)}
    kwargs = {}
    for key, value in data.items():
        if key not in valid:
            logger.debug("Ignoring unknown field %s.%s", cls.__name__, key)
            continue
        kwargs[key] = value
    return cls(**kwargs)
def serialize_dataclass(obj) -> dict:
    """Serialize any dataclass to dict via dataclasses.asdict().

    Recursive: nested dataclasses, lists, and dicts are converted too.
    """
    return dataclasses.asdict(obj)
def serialize_dataclass_list(items) -> list[dict]:
    """Serialize a sequence of dataclass instances, preserving order."""
    return list(map(dataclasses.asdict, items))

View File

@@ -0,0 +1,108 @@
"""
Serializers for detection pipeline runtime models.
Mirrors core/schema/models/detect_pipeline.py.
Special handling:
- Frame.image (np.ndarray → S3, excluded from JSON)
- TextCandidate.frame (object ref → frame_sequence integer)
Everything else uses dataclasses.asdict() via safe_construct.
"""
from __future__ import annotations
import dataclasses
from core.schema.models.detect_pipeline import (
BoundingBox,
BrandDetection,
BrandStats,
DetectionReport,
Frame,
PipelineStats,
TextCandidate,
)
from ._common import safe_construct, serialize_dataclass, serialize_dataclass_list
# ---------------------------------------------------------------------------
# Frame — image goes to S3 separately
# ---------------------------------------------------------------------------
def serialize_frame_meta(frame: Frame) -> dict:
    """Serialize Frame metadata only (no image).

    Builds the dict field-by-field instead of ``dataclasses.asdict()``,
    which would deep-copy the (potentially large) image ndarray only to
    delete it immediately. The remaining Frame fields are flat scalars,
    so no recursive conversion is lost.
    """
    return {
        f.name: getattr(frame, f.name)
        for f in dataclasses.fields(frame)
        if f.name != "image"
    }
def serialize_frames_with_upload(frames: list[Frame], job_id: str) -> tuple[list[dict], dict[int, str]]:
    """Upload frame images to S3 and return (metadata list, sequence manifest)."""
    # Local import keeps the S3/checkpoint dependency out of module import time.
    from detect.checkpoint.frames import save_frames
    manifest = save_frames(job_id, frames)
    return [serialize_frame_meta(frame) for frame in frames], manifest
def deserialize_frames_with_download(meta: list[dict], manifest: dict, job_id: str) -> list[Frame]:
    """Load frames back from S3 using the manifest plus stored metadata."""
    from detect.checkpoint.frames import load_frames
    # JSON round-trips object keys as strings; frame sequence numbers are ints.
    by_sequence = {int(seq): ref for seq, ref in manifest.items()}
    # NOTE(review): job_id is accepted but unused — the manifest appears to
    # fully locate the stored frames; confirm before removing the parameter.
    return load_frames(by_sequence, meta)
# ---------------------------------------------------------------------------
# TextCandidate — frame ref is an object, stored as sequence int
# ---------------------------------------------------------------------------
def serialize_text_candidate(tc: TextCandidate) -> dict:
    """Flatten a TextCandidate for storage: the Frame ref becomes its sequence number."""
    return {
        "frame_sequence": tc.frame.sequence,
        "bbox": dataclasses.asdict(tc.bbox),
        "text": tc.text,
        "ocr_confidence": tc.ocr_confidence,
    }
def serialize_text_candidates(candidates: list[TextCandidate]) -> list[dict]:
    """Serialize every candidate, preserving order."""
    return list(map(serialize_text_candidate, candidates))
def deserialize_text_candidate(data: dict, frame_map: dict[int, Frame]) -> TextCandidate:
    """Rebuild a TextCandidate, resolving frame_sequence back to a live Frame."""
    return TextCandidate(
        frame=frame_map[data["frame_sequence"]],
        bbox=safe_construct(BoundingBox, data["bbox"]),
        text=data["text"],
        ocr_confidence=data["ocr_confidence"],
    )
def deserialize_text_candidates(data: list[dict], frame_map: dict[int, Frame]) -> list[TextCandidate]:
    """Rebuild every candidate, resolving frame refs through frame_map."""
    return [deserialize_text_candidate(entry, frame_map) for entry in data]
# ---------------------------------------------------------------------------
# BoundingBox, BrandDetection, PipelineStats, etc — standard dataclasses
# ---------------------------------------------------------------------------
def deserialize_bounding_box(data: dict) -> BoundingBox:
    """Rebuild a BoundingBox from its dict form (all fields are flat scalars)."""
    return safe_construct(BoundingBox, data)
def deserialize_brand_detection(data: dict) -> BrandDetection:
    """Rebuild a BrandDetection from its asdict() form.

    safe_construct is shallow, so the nested ``bbox`` value — a plain dict
    after dataclasses.asdict() on the serialize side — must be rebuilt into
    a BoundingBox explicitly; otherwise it round-trips as a dict, violating
    the declared ``bbox: BoundingBox | None`` field type.
    """
    payload = dict(data)
    if payload.get("bbox") is not None:
        payload["bbox"] = safe_construct(BoundingBox, payload["bbox"])
    return safe_construct(BrandDetection, payload)
def deserialize_pipeline_stats(data: dict) -> PipelineStats:
    """Rebuild PipelineStats from its dict form (all fields are flat scalars)."""
    return safe_construct(PipelineStats, data)
def deserialize_detection_report(data: dict) -> DetectionReport:
    """Rebuild a DetectionReport from its asdict() form.

    safe_construct is shallow: ``brands``, ``timeline`` and
    ``pipeline_stats`` arrive as plain dicts/lists-of-dicts (produced by
    dataclasses.asdict() on the serialize side), so they are reconstructed
    here to honor the declared field types of DetectionReport.
    """
    payload = dict(data)
    if "brands" in payload:
        payload["brands"] = {
            brand: safe_construct(BrandStats, stats)
            for brand, stats in payload["brands"].items()
        }
    if "timeline" in payload:
        payload["timeline"] = [
            deserialize_brand_detection(entry) for entry in payload["timeline"]
        ]
    if "pipeline_stats" in payload:
        payload["pipeline_stats"] = safe_construct(PipelineStats, payload["pipeline_stats"])
    return safe_construct(DetectionReport, payload)