major refactor

2026-03-27 06:02:58 -03:00
parent bcf6f3dc71
commit 51ce14a812
18 changed files with 351 additions and 523 deletions

View File

@@ -25,11 +25,10 @@ from .grpc import (
ProgressUpdate,
WorkerStatus,
)
-from .job import (
-    Job, JobStatus, RunType,
-    Timeline, Checkpoint,
-    BrandSource, Brand,
-)
+from .job import Job, JobStatus, RunType
+from .timeline import Timeline
+from .checkpoint import Checkpoint
+from .brand import BrandSource, Brand
from .media import AssetStatus, MediaAsset
from .presets import BUILTIN_PRESETS, TranscodePreset
from .detect import DETECT_VIEWS # noqa: F401 — discovered by modelgen generic loader
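The net effect of this hunk: the package's public surface stays the same while the definitions move into per-concern modules. A hedged sketch of the two equivalent import paths (assuming this is the __init__ of core.schema.models, which the serializer imports further down suggest but this hunk does not state):

# Either path resolves to the same classes after the refactor:
from core.schema.models import Job, Timeline, Checkpoint, Brand  # via __init__ re-exports
from core.schema.models.brand import Brand, BrandSource          # per-module path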

View File

@@ -0,0 +1,38 @@
"""Brand schema — source of truth for brand discovery."""
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional
from uuid import UUID
class BrandSource(str, Enum):
OCR = "ocr"
VLM = "local_vlm"
CLOUD = "cloud_llm"
MANUAL = "manual"
@dataclass
class Brand:
"""
A brand discovered or registered in the system.
Airings track where/when the brand appeared — each airing
references a timeline and a frame range.
"""
id: UUID
canonical_name: str
aliases: List[str] = field(default_factory=list)
source: BrandSource = BrandSource.OCR # how first discovered
confirmed: bool = False
# Airings — JSONB array of appearances
# [{timeline_id, frame_start, frame_end, confidence, source, timestamp}]
airings: List[Dict[str, Any]] = field(default_factory=list)
total_airings: int = 0
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
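A minimal sketch of how a stage might record an airing against this schema. The dict keys follow the comment above; the concrete values, the uuid4 ids, and the epoch-float timestamp are illustrative assumptions:

from uuid import uuid4

brand = Brand(id=uuid4(), canonical_name="Acme Cola", aliases=["ACME COLA"])
brand.airings.append({
    "timeline_id": str(uuid4()),  # the timeline the brand appeared on
    "frame_start": 120,
    "frame_end": 168,
    "confidence": 0.91,
    "source": BrandSource.OCR.value,
    "timestamp": 1743052978.0,    # epoch seconds assumed; the type isn't pinned down here
})
brand.total_airings = len(brand.airings)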

View File

@@ -0,0 +1,38 @@
"""Checkpoint schema — source of truth for pipeline state snapshots."""
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, Optional
from uuid import UUID
@dataclass
class Checkpoint:
"""
A snapshot of pipeline state on a timeline.
Stage outputs stored as JSONB — each stage serializes to JSON,
the checkpoint stores it without knowing the shape.
parent_id forms a tree: multiple children from the same parent
= different config tries from the same starting point.
"""
id: UUID
timeline_id: UUID
parent_id: Optional[UUID] = None # null = root checkpoint
# Stage outputs — JSONB per stage, opaque to the checkpoint layer
stage_outputs: Dict[str, Any] = field(default_factory=dict)
# Config that produced this checkpoint
config_overrides: Dict[str, Any] = field(default_factory=dict)
# Pipeline state
stats: Dict[str, Any] = field(default_factory=dict)
# Scenario bookmark
is_scenario: bool = False
scenario_label: str = ""
created_at: Optional[datetime] = None
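The parent_id tree in practice: two children forked from one root to try different configs, one bookmarked as a scenario. A sketch with illustrative names and values (ocr_min_confidence is hypothetical):

from uuid import uuid4

timeline_id = uuid4()
root = Checkpoint(id=uuid4(), timeline_id=timeline_id)
# Siblings share a parent but diverge on config: two tries from one starting point.
loose = Checkpoint(id=uuid4(), timeline_id=timeline_id, parent_id=root.id,
                   config_overrides={"ocr_min_confidence": 0.3})
strict = Checkpoint(id=uuid4(), timeline_id=timeline_id, parent_id=root.id,
                    config_overrides={"ocr_min_confidence": 0.8},
                    is_scenario=True, scenario_label="high-precision pass")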

View File

@@ -1,177 +0,0 @@
"""
Detection Job and Checkpoint Schema Definitions
Source of truth for detection pipeline job tracking and stage checkpoints.
Follows the TranscodeJob/ChunkJob pattern.
"""
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional
from uuid import UUID
class DetectJobStatus(str, Enum):
PENDING = "pending"
RUNNING = "running"
PAUSED = "paused"
COMPLETED = "completed"
FAILED = "failed"
CANCELLED = "cancelled"
class RunType(str, Enum):
INITIAL = "initial"
REPLAY = "replay"
RETRY = "retry"
@dataclass
class DetectJob:
"""
A detection pipeline job.
Each invocation of the pipeline (initial run, replay, retry) creates a DetectJob.
Jobs for the same source video are linked via parent_job_id.
"""
id: UUID
# Input
source_asset_id: UUID
video_path: str
profile_name: str = "soccer_broadcast"
# Run lineage
parent_job_id: Optional[UUID] = None # links all runs for the same source
run_type: RunType = RunType.INITIAL
replay_from_stage: Optional[str] = None # null for initial runs
config_overrides: Dict[str, Any] = field(default_factory=dict)
# Status
status: DetectJobStatus = DetectJobStatus.PENDING
current_stage: Optional[str] = None
progress: float = 0.0
error_message: Optional[str] = None
# Results summary
total_detections: int = 0
brands_found: int = 0
cloud_llm_calls: int = 0
estimated_cost_usd: float = 0.0
# Worker tracking
celery_task_id: Optional[str] = None
priority: int = 0
# Timestamps
created_at: Optional[datetime] = None
started_at: Optional[datetime] = None
completed_at: Optional[datetime] = None
@dataclass
class Timeline:
"""
The frame sequence from a source video.
Independent of stages — exists before any stage runs.
Stages annotate the timeline, they don't own it.
Frames are stored in MinIO as JPEGs.
"""
id: UUID
source_asset_id: Optional[UUID] = None
source_video: str = ""
profile_name: str = ""
fps: float = 2.0
# Frame metadata (images in MinIO, metadata here)
frames_prefix: str = "" # s3: timelines/{id}/frames/
frames_manifest: Dict[int, str] = field(default_factory=dict) # seq → s3 key
frames_meta: List[Dict[str, Any]] = field(default_factory=list)
created_at: Optional[datetime] = None
@dataclass
class Checkpoint:
"""
A snapshot of pipeline state on a timeline.
Stage outputs stored as JSONB — each stage serializes to JSON,
the checkpoint stores it without knowing the shape.
parent_id forms a tree: multiple children from the same parent
= different config tries from the same starting point.
"""
id: UUID
timeline_id: UUID
parent_id: Optional[UUID] = None # null = root checkpoint
# Stage outputs — JSONB per stage, opaque to the checkpoint layer
stage_outputs: Dict[str, Any] = field(default_factory=dict)
# Config that produced this checkpoint
config_overrides: Dict[str, Any] = field(default_factory=dict)
# Pipeline state
stats: Dict[str, Any] = field(default_factory=dict)
# Scenario bookmark
is_scenario: bool = False
scenario_label: str = ""
created_at: Optional[datetime] = None
class BrandSource(str, Enum):
"""How a brand was first identified."""
OCR = "ocr"
VLM = "local_vlm"
CLOUD = "cloud_llm"
MANUAL = "manual" # user-added via UI
@dataclass
class KnownBrand:
"""
A brand discovered or registered in the system.
Global — not per-source. Accumulates across all pipeline runs.
Aliases enable fuzzy matching without re-escalating to VLM.
"""
id: UUID
canonical_name: str # normalized display name
aliases: List[str] = field(default_factory=list) # known spellings/variants
first_source: BrandSource = BrandSource.OCR
total_occurrences: int = 0
confirmed: bool = False # manually confirmed by user
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
@dataclass
class SourceBrandSighting:
"""
A brand seen in a specific source (video/asset).
Per-source session cache — avoids re-escalating the same brand
on subsequent frames or re-runs of the same source.
"""
id: UUID
source_asset_id: UUID # the video this sighting belongs to
brand_id: UUID # FK to KnownBrand
brand_name: str # denormalized for fast lookup
first_seen_timestamp: float = 0.0
last_seen_timestamp: float = 0.0
occurrences: int = 0
detection_source: BrandSource = BrandSource.OCR
avg_confidence: float = 0.0
created_at: Optional[datetime] = None
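Worth noting what this deletion collapses: KnownBrand (global identity) and SourceBrandSighting (per-source cache) fold into the single Brand above, whose airings array now carries the appearances. A one-off illustration of that mapping, not a migration shipped in this commit; field correspondences are inferred from the two docstrings:

def fold_into_brand(known, sightings):
    """Illustrative only: collapse a KnownBrand plus its sightings
    into the new Brand shape."""
    brand = Brand(
        id=known.id,
        canonical_name=known.canonical_name,
        aliases=list(known.aliases),
        source=known.first_source,
        confirmed=known.confirmed,
    )
    for s in sightings:
        brand.airings.append({
            "timeline_id": None,   # legacy sightings were asset-scoped, not timeline-scoped
            "frame_start": None,   # old rows kept timestamps, not frame ranges
            "frame_end": None,
            "confidence": s.avg_confidence,
            "source": s.detection_source.value,
            "timestamp": s.first_seen_timestamp,
        })
    brand.total_airings = len(brand.airings)
    return brand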

View File

@@ -1,14 +1,9 @@
"""
Job, Timeline, and Checkpoint Schema Definitions
Source of truth for pipeline jobs, timelines, and checkpoints.
Generates: SQLModel (core/db/models.py), TypeScript via modelgen.
"""
"""Job schema — source of truth for pipeline jobs."""
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional
from typing import Any, Dict, Optional
from uuid import UUID
@@ -68,91 +63,3 @@ class Job:
created_at: Optional[datetime] = None
started_at: Optional[datetime] = None
completed_at: Optional[datetime] = None
@dataclass
class Timeline:
"""
The frame sequence from a source video.
Independent of stages — exists before any stage runs.
Frames stored in MinIO as JPEGs, metadata here.
One timeline per job.
"""
id: UUID
source_asset_id: Optional[UUID] = None
source_video: str = ""
profile_name: str = ""
fps: float = 2.0
frames_prefix: str = "" # s3: timeline/{id}/frames/
frames_manifest: Dict[int, str] = field(default_factory=dict) # seq → s3 key
frames_meta: List[Dict[str, Any]] = field(default_factory=list)
created_at: Optional[datetime] = None
@dataclass
class Checkpoint:
"""
A snapshot of pipeline state on a timeline.
Stage outputs stored as JSONB — each stage serializes to JSON,
the checkpoint stores it without knowing the shape.
parent_id forms a tree: multiple children from the same parent
= different config tries from the same starting point.
"""
id: UUID
timeline_id: UUID
parent_id: Optional[UUID] = None # null = root checkpoint
# Stage outputs — JSONB per stage, opaque to the checkpoint layer
stage_outputs: Dict[str, Any] = field(default_factory=dict)
# Config that produced this checkpoint
config_overrides: Dict[str, Any] = field(default_factory=dict)
# Pipeline state
stats: Dict[str, Any] = field(default_factory=dict)
# Scenario bookmark
is_scenario: bool = False
scenario_label: str = ""
created_at: Optional[datetime] = None
# --- Brands ---
class BrandSource(str, Enum):
OCR = "ocr"
VLM = "local_vlm"
CLOUD = "cloud_llm"
MANUAL = "manual"
@dataclass
class Brand:
"""
A brand discovered or registered in the system.
Airings track where/when the brand appeared — each airing
references a timeline and a frame range.
"""
id: UUID
canonical_name: str
aliases: List[str] = field(default_factory=list)
source: BrandSource = BrandSource.OCR # how first discovered
confirmed: bool = False
# Airings — JSONB array of appearances
# [{timeline_id, frame_start, frame_end, confidence, source, timestamp}]
airings: List[Dict[str, Any]] = field(default_factory=list)
total_airings: int = 0
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None

View File

@@ -1,13 +1,9 @@
"""
Detection pipeline runtime models.
These are the data structures that flow between LangGraph nodes.
They contain runtime types (np.ndarray) so they are NOT generated
by modelgen they live here for the schema to be the complete
map of the application, but modelgen skips them.
Wire-format models (SSE events) are in detect.py.
DB models (jobs, checkpoints) are in detect_jobs.py.
These are the data structures that flow between pipeline stages.
They contain runtime types (np.ndarray) so modelgen skips them
not generated to SQLModel or TypeScript.
"""
from __future__ import annotations
@@ -89,10 +85,3 @@ class DetectionReport:
brands: dict[str, BrandStats] = field(default_factory=dict)
timeline: list[BrandDetection] = field(default_factory=list)
pipeline_stats: PipelineStats = field(default_factory=PipelineStats)
# Not in DATACLASSES — modelgen skips these (they contain np.ndarray)
RUNTIME_MODELS = [
Frame, BoundingBox, TextCandidate, BrandDetection,
BrandStats, PipelineStats, DetectionReport,
]
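The RUNTIME_MODELS list documents intent rather than driving generation: these classes are skipped simply by not appearing in whatever DATACLASSES registry modelgen reads. A minimal sketch of a generator-side guard under that assumption (should_generate is hypothetical, not part of this diff):

from core.schema.models.pipeline import RUNTIME_MODELS, Frame

def should_generate(model) -> bool:
    # Runtime-only models carry np.ndarray fields, which can't round-trip
    # to SQLModel or TypeScript, so the emitters never see them.
    return model not in RUNTIME_MODELS

assert not should_generate(Frame)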

View File

@@ -0,0 +1,29 @@
"""Timeline schema — source of truth for frame sequences."""
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional
from uuid import UUID
@dataclass
class Timeline:
"""
The frame sequence from a source video.
Independent of stages — exists before any stage runs.
Frames stored in MinIO as JPEGs, metadata here.
One timeline per job.
"""
id: UUID
source_asset_id: Optional[UUID] = None
source_video: str = ""
profile_name: str = ""
fps: float = 2.0
frames_prefix: str = "" # s3: timeline/{id}/frames/
frames_manifest: Dict[int, str] = field(default_factory=dict) # seq → s3 key
frames_meta: List[Dict[str, Any]] = field(default_factory=list)
created_at: Optional[datetime] = None
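A sketch of how a stage might resolve a frame image key from this schema. The manifest-first lookup and the zero-padded fallback name are assumptions; only the prefix layout comes from the comment above:

def frame_key(timeline: Timeline, seq: int) -> str:
    # Prefer the manifest (seq -> s3 key); otherwise derive from the prefix.
    return timeline.frames_manifest.get(
        seq, f"{timeline.frames_prefix}{seq:06d}.jpg"
    )

# e.g. frame_key(tl, 42) -> "timeline/<id>/frames/000042.jpg"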

View File

@@ -1,8 +1,6 @@
"""
Serializers for detection pipeline runtime models.
Mirrors core/schema/models/detect_pipeline.py.
Special handling:
- Frame.image (np.ndarray → S3, excluded from JSON)
- TextCandidate.frame (object ref → frame_sequence integer)
@@ -13,7 +11,7 @@ from __future__ import annotations
import dataclasses
-from core.schema.models.detect_pipeline import (
+from core.schema.models.pipeline import (
BoundingBox,
BrandDetection,
BrandStats,
@@ -59,13 +57,12 @@ def deserialize_frames_with_download(meta: list[dict], manifest: dict, job_id: s
def serialize_text_candidate(tc: TextCandidate) -> dict:
bbox_dict = dataclasses.asdict(tc.bbox)
-    result = {
+    return {
"frame_sequence": tc.frame.sequence,
"bbox": bbox_dict,
"text": tc.text,
"ocr_confidence": tc.ocr_confidence,
}
-    return result
def serialize_text_candidates(candidates: list[TextCandidate]) -> list[dict]:
@@ -75,13 +72,12 @@ def serialize_text_candidates(candidates: list[TextCandidate]) -> list[dict]:
def deserialize_text_candidate(data: dict, frame_map: dict[int, Frame]) -> TextCandidate:
frame = frame_map[data["frame_sequence"]]
bbox = safe_construct(BoundingBox, data["bbox"])
-    candidate = TextCandidate(
+    return TextCandidate(
frame=frame,
bbox=bbox,
text=data["text"],
ocr_confidence=data["ocr_confidence"],
)
-    return candidate
def deserialize_text_candidates(data: list[dict], frame_map: dict[int, Frame]) -> list[TextCandidate]:
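The rewrites above are behavior-preserving: they return the literal instead of binding it to a local first. A hedged round-trip under the frame_sequence convention (frames and candidates are assumed to be in scope; the plural helpers map the singular ones over a list, as their signatures suggest):

frame_map = {f.sequence: f for f in frames}          # sequence -> Frame
payload = serialize_text_candidates(candidates)      # JSON-safe dicts, no ndarrays
restored = deserialize_text_candidates(payload, frame_map)
assert restored[0].frame is frame_map[payload[0]["frame_sequence"]]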