refactor stage 1

2026-03-27 04:23:21 -03:00
parent df6bcb01e8
commit 291ac8dd40
14 changed files with 688 additions and 450 deletions


@@ -72,49 +72,58 @@ class DetectJob:
 @dataclass
-class StageCheckpoint:
+class Timeline:
     """
-    A checkpoint saved after a pipeline stage completes.
+    The frame sequence from a source video.
-    Binary data (frame images, crops) goes to S3/MinIO.
-    Everything else (structured state) lives here in Postgres.
+    Independent of stages — exists before any stage runs.
+    Stages annotate the timeline, they don't own it.
+    Frames are stored in MinIO as JPEGs.
     """
     id: UUID
-    job_id: UUID
-    stage: str
-    stage_index: int  # position in NODES list (0-7)
+    source_asset_id: Optional[UUID] = None
     source_video: str = ""
-    profile_name: str = ""
     fps: float = 2.0
-    # S3 reference for binary data only
-    frames_prefix: str = ""  # s3 prefix: checkpoints/{job_id}/frames/
-    # Frame metadata (non-image fields)
+    # Frame metadata (images in MinIO, metadata here)
+    frames_prefix: str = ""  # s3: timelines/{id}/frames/
     frames_manifest: Dict[int, str] = field(default_factory=dict)  # seq → s3 key
-    frames_meta: List[Dict[str, Any]] = field(default_factory=list)  # sequence, chunk_id, timestamp, hash
-    filtered_frame_sequences: List[int] = field(default_factory=list)
+    frames_meta: List[Dict[str, Any]] = field(default_factory=list)
-    # Stage output — stored as blob in MinIO: checkpoints/{job_id}/stages/{stage}.bson
-    # Each stage's serialize_fn/deserialize_fn owns the format.
-    # Postgres only stores the S3 key, not the data itself.
-    stage_output_key: str = ""  # s3 key to the serialized stage output
+    created_at: Optional[datetime] = None
-    # Pipeline state (small, stays in Postgres)
-    stats: Dict[str, Any] = field(default_factory=dict)
-    config_snapshot: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class Checkpoint:
+    """
+    A snapshot of pipeline state on a timeline.
+    Stage outputs stored as JSONB — each stage serializes to JSON,
+    the checkpoint stores it without knowing the shape.
+    parent_id forms a tree: multiple children from the same parent
+    = different config tries from the same starting point.
+    """
+    id: UUID
+    timeline_id: UUID
+    parent_id: Optional[UUID] = None  # null = root checkpoint
+    # Stage outputs — JSONB per stage, opaque to the checkpoint layer
+    stage_outputs: Dict[str, Any] = field(default_factory=dict)
+    # Config that produced this checkpoint
+    config_overrides: Dict[str, Any] = field(default_factory=dict)
+    # Input refs (for replay)
+    video_path: str = ""
+    profile_name: str = ""
+    # Pipeline state
+    stats: Dict[str, Any] = field(default_factory=dict)
-    # Scenario — a checkpoint bookmarked for the editor workflow.
-    # Created by seeders (manual scripts that populate state from real footage)
-    # or captured from a running pipeline. Loaded via URL:
-    # /detection/?job=<job_id>#/editor/<stage>
+    # Scenario bookmark
     is_scenario: bool = False
-    scenario_label: str = ""  # human-readable name, e.g. "chelsea_edges_lowcanny"
+    scenario_label: str = ""
     # Timestamps
     created_at: Optional[datetime] = None
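
To make the new split concrete, here is a minimal sketch (not part of this commit) of how a Timeline might be populated: frames are sampled from the source video, written to MinIO as JPEGs under frames_prefix, and only their keys and metadata are kept on the dataclass. The build_timeline helper, the bucket name, and the endpoint are illustrative assumptions; boto3 and OpenCV stand in for whatever client code the project actually uses.

# Illustrative sketch, not part of this commit. Assumes the Timeline
# dataclass above is importable and that a "detection" bucket exists on a
# MinIO server reachable through its S3-compatible API (all assumptions).
import uuid
from datetime import datetime, timezone

import boto3  # MinIO speaks the S3 protocol, so boto3 works against it
import cv2    # assumption: OpenCV decodes the source video

def build_timeline(video_path: str, fps: float = 2.0) -> Timeline:
    tl = Timeline(id=uuid.uuid4(), source_video=video_path, fps=fps,
                  created_at=datetime.now(timezone.utc))
    tl.frames_prefix = f"timelines/{tl.id}/frames/"
    s3 = boto3.client("s3", endpoint_url="http://localhost:9000")  # assumed endpoint

    cap = cv2.VideoCapture(video_path)
    native_fps = cap.get(cv2.CAP_PROP_FPS) or fps
    step = max(1, round(native_fps / fps))  # sample down to the timeline fps

    seq = frame_idx = 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        if frame_idx % step == 0:
            encoded, jpeg = cv2.imencode(".jpg", frame)
            if encoded:
                key = f"{tl.frames_prefix}{seq:06d}.jpg"  # JPEG bytes go to MinIO
                s3.put_object(Bucket="detection", Key=key, Body=jpeg.tobytes())
                tl.frames_manifest[seq] = key             # seq → s3 key
                tl.frames_meta.append({"sequence": seq,
                                       "timestamp": frame_idx / native_fps})
                seq += 1
        frame_idx += 1
    cap.release()
    return tl

Stages and checkpoints can then reference the timeline by id without ever carrying frame bytes themselves.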
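And a sketch of the tree the Checkpoint docstring describes: forking two children from one parent gives two config tries from the same starting point. The fork_checkpoint helper and the canny values are hypothetical, chosen only to show the parent_id semantics.

# Illustrative sketch, not part of this commit. Assumes the Checkpoint
# dataclass above is importable; fork_checkpoint is a hypothetical helper.
import uuid
from datetime import datetime, timezone

def fork_checkpoint(parent: Checkpoint, overrides: dict) -> Checkpoint:
    """Child checkpoint: same timeline, parent's outputs, new config on top."""
    return Checkpoint(
        id=uuid.uuid4(),
        timeline_id=parent.timeline_id,
        parent_id=parent.id,                       # links the child into the tree
        stage_outputs=dict(parent.stage_outputs),  # copied; still opaque JSON
        config_overrides={**parent.config_overrides, **overrides},
        video_path=parent.video_path,
        profile_name=parent.profile_name,
        created_at=datetime.now(timezone.utc),
    )

root = Checkpoint(id=uuid.uuid4(), timeline_id=uuid.uuid4())  # parent_id=None → root
root.stage_outputs["detect"] = {"boxes": [[10, 20, 50, 80]]}  # shape owned by the stage

# Two siblings from one parent = two config tries from the same starting point.
low = fork_checkpoint(root, {"canny_low": 40})
high = fork_checkpoint(root, {"canny_low": 120})
assert low.parent_id == high.parent_id == root.id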