commit aac27b8504
parent 4220b0418e
2026-03-30 09:53:10 -03:00
32 changed files with 1068 additions and 329 deletions

View File

@@ -28,6 +28,7 @@ from .grpc import (
 from .job import Job, JobStatus, RunType
 from .timeline import Timeline
 from .checkpoint import Checkpoint
+from .stage_output import StageOutput
 from .brand import BrandSource, Brand
 from .media import AssetStatus, MediaAsset
 from .profile import Profile
@@ -41,7 +42,7 @@ from .source import ChunkInfo, SourceJob, SourceType
 # Core domain models - generates SQLModel, TypeScript
 DATACLASSES = [MediaAsset, TranscodePreset,
-               Job, Timeline, Checkpoint, Brand, Profile]
+               Job, Timeline, Checkpoint, StageOutput, Brand, Profile]
 # API request/response models
 API_MODELS = [

View File

@@ -11,25 +11,24 @@ class Checkpoint:
     """
     A snapshot of pipeline state on a timeline.
-    Stage outputs stored as JSONB — each stage serializes to JSON,
-    the checkpoint stores it without knowing the shape.
     parent_id forms a tree: multiple children from the same parent
     = different config tries from the same starting point.
+    Stage outputs are stored separately in the StageOutput table,
+    not carried in the checkpoint itself.
     """
     id: UUID
     timeline_id: UUID
-    job_id: Optional[UUID] = None  # which job created this checkpoint
-    parent_id: Optional[UUID] = None  # null = root checkpoint
-    # Stage outputs — JSONB per stage, opaque to the checkpoint layer
-    stage_outputs: Dict[str, Any] = field(default_factory=dict)
+    job_id: Optional[UUID] = None
+    parent_id: Optional[UUID] = None
     stage_name: str = ""  # which stage produced this checkpoint
     # Config that produced this checkpoint
     config_overrides: Dict[str, Any] = field(default_factory=dict)
-    # Pipeline state
+    # Pipeline stats at this point
     stats: Dict[str, Any] = field(default_factory=dict)
     # Scenario bookmark
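With stage outputs moved out of the checkpoint row, resuming from a checkpoint
means joining against StageOutput instead of reading a JSONB blob on the
checkpoint. A minimal read-path sketch, assuming a generated StageOutputRow
table and a SQLAlchemy-style session (both names are illustrative, not from
this commit):

    from typing import Any, Dict
    from uuid import UUID

    def load_checkpoint_outputs(session, checkpoint_id: UUID) -> Dict[str, Any]:
        """Collect {stage_name: output} for one checkpoint (sketch)."""
        rows = (
            session.query(StageOutputRow)  # assumed generated table
            .filter(StageOutputRow.checkpoint_id == checkpoint_id)
            .all()
        )
        return {row.stage_name: row.output for row in rows}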

View File

@@ -38,7 +38,7 @@ class Job:
     video_path: str
     profile_name: str = "soccer_broadcast"
-    # Timeline — set after frame extraction, or upfront for replay jobs
+    # Timeline — set at job creation (timeline exists before any job)
     timeline_id: Optional[UUID] = None
     # Lineage

View File

@@ -0,0 +1,27 @@
+"""StageOutput schema — per-stage result storage."""
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Any, Dict, Optional
+from uuid import UUID
+
+
+@dataclass
+class StageOutput:
+    """
+    Output of a single stage within a job.
+    Flat table with a composite unique key on (job_id, stage_name).
+    Upserted on each stage completion. Independently queryable —
+    "give me all edge detection outputs for this timeline."
+    """
+    id: UUID
+    job_id: UUID
+    timeline_id: UUID
+    stage_name: str
+    checkpoint_id: Optional[UUID] = None
+    output: Dict[str, Any] = field(default_factory=dict)
+    created_at: Optional[datetime] = None
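The composite unique key makes the write path a plain upsert. A sketch of what
that can look like with SQLAlchemy's PostgreSQL insert helper; the table object
and session are assumptions here, since the commit only defines the dataclass
and the SQLModel table is generated elsewhere:

    from sqlalchemy.dialects.postgresql import insert

    def upsert_stage_output(session, table, row: dict) -> None:
        """Insert-or-update one stage result (sketch, assumed table)."""
        stmt = insert(table).values(**row)
        stmt = stmt.on_conflict_do_update(
            index_elements=["job_id", "stage_name"],  # the composite unique key
            set_={
                "output": stmt.excluded.output,  # latest run wins
                "checkpoint_id": stmt.excluded.checkpoint_id,
            },
        )
        session.execute(stmt)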

View File

@@ -1,4 +1,4 @@
-"""Timeline schema — source of truth for frame sequences."""
+"""Timeline schema — source of truth for source material sequences."""
 from dataclasses import dataclass, field
 from datetime import datetime
@@ -9,21 +9,27 @@ from uuid import UUID
 @dataclass
 class Timeline:
     """
-    The frame sequence from a source video.
-    Independent of stages — exists before any stage runs.
-    Frames stored in MinIO as JPEGs, metadata here.
-    One timeline per job.
+    A user-created selection of source material.
+    Exists before any job runs. Holds source references (chunk paths,
+    asset IDs) and extraction config.
+    Frame cache: extracted frames live at media/timelines/{id}/frames/
+    as JPEGs. Any job on this timeline reads from the cache. Cache is
+    rebuildable from chunks (clear + re-extract). For ephemeral sources
+    (streams), the cache is the only record.
+    Many jobs can work on the same timeline.
     """
     id: UUID
     name: str = ""
     source_asset_id: Optional[UUID] = None
-    source_video: str = ""
+    chunk_paths: List[str] = field(default_factory=list)
     profile_name: str = ""
+    status: str = "created"  # created | cached | ready
     fps: float = 2.0
     frames_prefix: str = ""  # s3: timeline/{id}/frames/
-    frames_manifest: Dict[int, str] = field(default_factory=dict)  # seq → s3 key
+    frames_meta: List[Dict[str, Any]] = field(default_factory=list)
     frame_count: int = 0
+    source_ephemeral: bool = False  # True for streams — cache can't be rebuilt
     created_at: Optional[datetime] = None
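The docstring fixes the cache layout, so frame object keys are derivable from
the timeline id alone. A minimal sketch of that convention (helper name and
zero-padding are illustrative, not from this commit):

    from uuid import UUID

    def frame_key(timeline_id: UUID, seq: int) -> str:
        """Object key for one cached frame, under the media bucket (sketch)."""
        return f"timelines/{timeline_id}/frames/{seq:06d}.jpg"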

View File

@@ -2,7 +2,7 @@
 Serializers for detection pipeline runtime models.
 Special handling:
-- Frame.image (np.ndarray → S3, excluded from JSON)
+- Frame.image (np.ndarray, ephemeral — only metadata serialized)
 - TextCandidate.frame (object ref → frame_sequence integer)
 Everything else uses dataclasses.asdict() via safe_construct.
 """
@@ -24,7 +24,7 @@ from ._common import safe_construct, serialize_dataclass, serialize_dataclass_li
 # ---------------------------------------------------------------------------
-# Frame — image goes to S3 separately
+# Frame — metadata only (image is ephemeral, re-extracted from chunks)
 # ---------------------------------------------------------------------------


 def serialize_frame_meta(frame: Frame) -> dict:
@@ -34,21 +34,9 @@ def serialize_frame_meta(frame: Frame) -> dict:
     return result


-def serialize_frames_with_upload(frames: list[Frame], job_id: str) -> tuple[list[dict], dict[int, str]]:
-    """Upload frame images to S3, return metadata + manifest."""
-    from core.detect.checkpoint.frames import save_frames
-    manifest = save_frames(job_id, frames)
-    meta = [serialize_frame_meta(f) for f in frames]
-    return meta, manifest
-
-
-def deserialize_frames_with_download(meta: list[dict], manifest: dict, job_id: str) -> list[Frame]:
-    """Load frames from S3 + metadata."""
-    from core.detect.checkpoint.frames import load_frames
-    int_manifest = {int(k): v for k, v in manifest.items()}
-    return load_frames(int_manifest, meta)
+def serialize_frames_meta(frames: list[Frame]) -> list[dict]:
+    """Serialize frame metadata for all frames."""
+    return [serialize_frame_meta(f) for f in frames]


 # ---------------------------------------------------------------------------
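With the upload/download pair gone, persisting frames reduces to metadata
only. A sketch of the intended round trip under the new model; the storage
call and the re-extraction step are assumed names, not functions from this
commit:

    # Persist: JSON-safe metadata, no np.ndarray payloads.
    meta = serialize_frames_meta(frames)
    store_stage_output(job_id, "frames", {"frames": meta})  # assumed helper

    # Resume: pixel data is rebuilt from the timeline's chunk cache,
    # then re-associated with the stored metadata.
    frames = extract_frames(timeline.chunk_paths, fps=timeline.fps)  # assumed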