# mediaproc/core/schema/models/detect_jobs.py

"""
Detection Job and Checkpoint Schema Definitions

Source of truth for detection pipeline job tracking and stage checkpoints.
Follows the TranscodeJob/ChunkJob pattern.
"""

from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional
from uuid import UUID


class DetectJobStatus(str, Enum):
    """
    Status values for a DetectJob (the type of DetectJob.status).

    Mixes in str, so each member compares equal to its plain string value
    (e.g. DetectJobStatus.PENDING == "pending").
    """

    PENDING = "pending"  # default status of a newly constructed DetectJob
    RUNNING = "running"
    PAUSED = "paused"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"


class RunType(str, Enum):
    """
    Kind of pipeline invocation a DetectJob represents (DetectJob.run_type).

    Mixes in str, so each member compares equal to its plain string value.
    """

    INITIAL = "initial"  # default run type of a newly constructed DetectJob
    REPLAY = "replay"
    RETRY = "retry"


@dataclass
class DetectJob:
    """
    A detection pipeline job.

    Each invocation of the pipeline (initial run, replay, retry) creates a DetectJob.
    Jobs for the same source video are linked via parent_job_id.

    Only id, source_asset_id and video_path are required; every other field
    has a default. Field order is part of the generated constructor signature,
    so new fields should be appended rather than inserted between existing ones.
    """

    id: UUID

    # Input
    source_asset_id: UUID
    video_path: str
    profile_name: str = "soccer_broadcast"

    # Run lineage
    parent_job_id: Optional[UUID] = None  # links all runs for the same source
    run_type: RunType = RunType.INITIAL
    replay_from_stage: Optional[str] = None  # null for initial runs
    # default_factory gives each instance its own dict (no shared mutable default)
    config_overrides: Dict[str, Any] = field(default_factory=dict)

    # Status
    status: DetectJobStatus = DetectJobStatus.PENDING
    current_stage: Optional[str] = None
    # NOTE(review): scale (0-1 vs 0-100) is not established in this module — confirm
    progress: float = 0.0
    error_message: Optional[str] = None

    # Results summary (all counters start at zero)
    total_detections: int = 0
    brands_found: int = 0
    cloud_llm_calls: int = 0
    estimated_cost_usd: float = 0.0

    # Worker tracking
    celery_task_id: Optional[str] = None  # presumably the Celery task running this job — confirm
    # NOTE(review): ordering direction (higher-first vs lower-first) not shown here — confirm
    priority: int = 0

    # Timestamps
    # All default to None; nothing in this module populates them.
    # NOTE(review): timezone awareness is not established here — confirm
    created_at: Optional[datetime] = None
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None


@dataclass
class StageCheckpoint:
    """
    A checkpoint saved after a pipeline stage completes.

    Binary data (frame images, crops) goes to S3/MinIO.
    Everything else (structured state) lives here in Postgres.

    id, job_id, stage and stage_index are required; every other field has a
    default. Field order is part of the generated constructor signature, so
    new fields should be appended rather than inserted between existing ones.
    """

    id: UUID
    job_id: UUID  # presumably the id of the owning DetectJob — confirm
    stage: str
    stage_index: int  # position in NODES list (0-7); NODES is not defined in this module

    # S3 reference for binary data only
    frames_prefix: str = ""  # s3 prefix: checkpoints/{job_id}/frames/

    # Frame metadata (non-image fields)
    # NOTE(review): keys are ints; if this is persisted as JSON they will come
    # back as strings — confirm the loader converts them.
    frames_manifest: Dict[int, str] = field(default_factory=dict)  # seq → s3 key
    frames_meta: List[Dict[str, Any]] = field(default_factory=list)  # sequence, chunk_id, timestamp, hash
    filtered_frame_sequences: List[int] = field(default_factory=list)

    # Stage output — stored as blob in MinIO: checkpoints/{job_id}/stages/{stage}.bson
    # Each stage's serialize_fn/deserialize_fn owns the format.
    # Postgres only stores the S3 key, not the data itself.
    stage_output_key: str = ""  # s3 key to the serialized stage output

    # Pipeline state (small, stays in Postgres)
    stats: Dict[str, Any] = field(default_factory=dict)
    # NOTE(review): presumably config_snapshot is the full effective config and
    # config_overrides only the per-run deltas — confirm against the writer.
    config_snapshot: Dict[str, Any] = field(default_factory=dict)
    config_overrides: Dict[str, Any] = field(default_factory=dict)

    # Input refs (for replay)
    # Unlike DetectJob, both default to the empty string here.
    video_path: str = ""
    profile_name: str = ""

    # Scenario — a checkpoint bookmarked for the editor workflow.
    # Created by seeders (manual scripts that populate state from real footage)
    # or captured from a running pipeline. Loaded via URL:
    #   /detection/?job=<job_id>&stage=<stage>&editor=true
    is_scenario: bool = False
    scenario_label: str = ""  # human-readable name, e.g. "chelsea_edges_lowcanny"

    # Timestamps
    created_at: Optional[datetime] = None  # defaults to None; not populated in this module


class BrandSource(str, Enum):
    """
    How a brand was first identified.

    Used by KnownBrand.first_source and SourceBrandSighting.detection_source.
    Mixes in str, so each member compares equal to its plain string value.
    Note that two member names differ from their values (VLM, CLOUD).
    """

    OCR = "ocr"
    VLM = "local_vlm"  # value is "local_vlm", not "vlm"
    CLOUD = "cloud_llm"  # value is "cloud_llm", not "cloud"
    MANUAL = "manual"  # user-added via UI


@dataclass
class KnownBrand:
    """
    A brand discovered or registered in the system.

    Global — not per-source. Accumulates across all pipeline runs.
    Aliases enable fuzzy matching without re-escalating to VLM.

    id and canonical_name are required; every other field has a default.
    """

    id: UUID
    canonical_name: str              # normalized display name
    aliases: List[str] = field(default_factory=list)  # known spellings/variants
    first_source: BrandSource = BrandSource.OCR  # defaults to OCR when not specified
    total_occurrences: int = 0
    confirmed: bool = False          # manually confirmed by user

    # Timestamps default to None; nothing in this module populates them.
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None


@dataclass
class SourceBrandSighting:
    """
    A brand seen in a specific source (video/asset).

    Per-source session cache — avoids re-escalating the same brand
    on subsequent frames or re-runs of the same source.

    id, source_asset_id, brand_id and brand_name are required; every other
    field has a default.
    """

    id: UUID
    source_asset_id: UUID            # the video this sighting belongs to
    brand_id: UUID                   # FK to KnownBrand
    brand_name: str                  # denormalized for fast lookup
    # NOTE(review): plain floats, presumably offsets into the video in seconds — confirm units
    first_seen_timestamp: float = 0.0
    last_seen_timestamp: float = 0.0
    occurrences: int = 0
    detection_source: BrandSource = BrandSource.OCR  # defaults to OCR when not specified
    # NOTE(review): value range (0-1 vs 0-100) is not established in this module — confirm
    avg_confidence: float = 0.0

    created_at: Optional[datetime] = None  # defaults to None; not populated in this module