""" Detection Job and Checkpoint Schema Definitions Source of truth for detection pipeline job tracking and stage checkpoints. Follows the TranscodeJob/ChunkJob pattern. """ from dataclasses import dataclass, field from datetime import datetime from enum import Enum from typing import Any, Dict, List, Optional from uuid import UUID class DetectJobStatus(str, Enum): PENDING = "pending" RUNNING = "running" PAUSED = "paused" COMPLETED = "completed" FAILED = "failed" CANCELLED = "cancelled" class RunType(str, Enum): INITIAL = "initial" REPLAY = "replay" RETRY = "retry" @dataclass class DetectJob: """ A detection pipeline job. Each invocation of the pipeline (initial run, replay, retry) creates a DetectJob. Jobs for the same source video are linked via parent_job_id. """ id: UUID # Input source_asset_id: UUID video_path: str profile_name: str = "soccer_broadcast" # Run lineage parent_job_id: Optional[UUID] = None # links all runs for the same source run_type: RunType = RunType.INITIAL replay_from_stage: Optional[str] = None # null for initial runs config_overrides: Dict[str, Any] = field(default_factory=dict) # Status status: DetectJobStatus = DetectJobStatus.PENDING current_stage: Optional[str] = None progress: float = 0.0 error_message: Optional[str] = None # Results summary total_detections: int = 0 brands_found: int = 0 cloud_llm_calls: int = 0 estimated_cost_usd: float = 0.0 # Worker tracking celery_task_id: Optional[str] = None priority: int = 0 # Timestamps created_at: Optional[datetime] = None started_at: Optional[datetime] = None completed_at: Optional[datetime] = None @dataclass class StageCheckpoint: """ A checkpoint saved after a pipeline stage completes. Binary data (frame images, crops) goes to S3/MinIO. Everything else (structured state) lives here in Postgres. """ id: UUID job_id: UUID stage: str stage_index: int # position in NODES list (0-7) # S3 reference for binary data only frames_prefix: str = "" # s3 prefix: checkpoints/{job_id}/frames/ # Frame metadata (non-image fields) frames_manifest: Dict[int, str] = field(default_factory=dict) # seq → s3 key frames_meta: List[Dict[str, Any]] = field(default_factory=list) # sequence, chunk_id, timestamp, hash filtered_frame_sequences: List[int] = field(default_factory=list) # Stage output — stored as blob in MinIO: checkpoints/{job_id}/stages/{stage}.bson # Each stage's serialize_fn/deserialize_fn owns the format. # Postgres only stores the S3 key, not the data itself. stage_output_key: str = "" # s3 key to the serialized stage output # Pipeline state (small, stays in Postgres) stats: Dict[str, Any] = field(default_factory=dict) config_snapshot: Dict[str, Any] = field(default_factory=dict) config_overrides: Dict[str, Any] = field(default_factory=dict) # Input refs (for replay) video_path: str = "" profile_name: str = "" # Scenario — a checkpoint bookmarked for the editor workflow. # Created by seeders (manual scripts that populate state from real footage) # or captured from a running pipeline. Loaded via URL: # /detection/?job=#/editor/ is_scenario: bool = False scenario_label: str = "" # human-readable name, e.g. "chelsea_edges_lowcanny" # Timestamps created_at: Optional[datetime] = None class BrandSource(str, Enum): """How a brand was first identified.""" OCR = "ocr" VLM = "local_vlm" CLOUD = "cloud_llm" MANUAL = "manual" # user-added via UI @dataclass class KnownBrand: """ A brand discovered or registered in the system. Global — not per-source. Accumulates across all pipeline runs. Aliases enable fuzzy matching without re-escalating to VLM. """ id: UUID canonical_name: str # normalized display name aliases: List[str] = field(default_factory=list) # known spellings/variants first_source: BrandSource = BrandSource.OCR total_occurrences: int = 0 confirmed: bool = False # manually confirmed by user created_at: Optional[datetime] = None updated_at: Optional[datetime] = None @dataclass class SourceBrandSighting: """ A brand seen in a specific source (video/asset). Per-source session cache — avoids re-escalating the same brand on subsequent frames or re-runs of the same source. """ id: UUID source_asset_id: UUID # the video this sighting belongs to brand_id: UUID # FK to KnownBrand brand_name: str # denormalized for fast lookup first_seen_timestamp: float = 0.0 last_seen_timestamp: float = 0.0 occurrences: int = 0 detection_source: BrandSource = BrandSource.OCR avg_confidence: float = 0.0 created_at: Optional[datetime] = None