Use SQLAlchemy pattern
This commit is contained in:
@@ -25,28 +25,25 @@ from .grpc import (
|
||||
ProgressUpdate,
|
||||
WorkerStatus,
|
||||
)
|
||||
from .jobs import ChunkJob, ChunkJobStatus, JobStatus, TranscodeJob
|
||||
from .detect_jobs import (
|
||||
DetectJob, DetectJobStatus, RunType,
|
||||
from .job import (
|
||||
Job, JobStatus, RunType,
|
||||
Timeline, Checkpoint,
|
||||
BrandSource, KnownBrand, SourceBrandSighting,
|
||||
BrandSource, Brand,
|
||||
)
|
||||
from .stages import StageConfigField, StageIO, StageDefinition, STAGE_VIEWS
|
||||
from .media import AssetStatus, MediaAsset
|
||||
from .presets import BUILTIN_PRESETS, TranscodePreset
|
||||
from .detect import DETECT_VIEWS # noqa: F401 — discovered by modelgen generic loader
|
||||
from .inference import INFERENCE_VIEWS # noqa: F401 — GPU inference server API types
|
||||
from .ui_state import UI_STATE_VIEWS # noqa: F401 — UI store state types
|
||||
from .stages import StageConfigField, StageIO, StageDefinition, STAGE_VIEWS # noqa: F401
|
||||
from .views import ChunkEvent, ChunkOutputFile, PipelineStats, WorkerEvent
|
||||
from .sources import ChunkInfo, SourceJob, SourceType
|
||||
|
||||
# Core domain models - generates Django, SQLModel, TypeScript
|
||||
DATACLASSES = [MediaAsset, TranscodePreset, TranscodeJob, ChunkJob,
|
||||
DetectJob, Timeline, Checkpoint,
|
||||
KnownBrand, SourceBrandSighting]
|
||||
# Core domain models - generates SQLModel, TypeScript
|
||||
DATACLASSES = [MediaAsset, TranscodePreset,
|
||||
Job, Timeline, Checkpoint, Brand]
|
||||
|
||||
# API request/response models - generates TypeScript only (no Django)
|
||||
# WorkerStatus from grpc.py is reused here
|
||||
# API request/response models
|
||||
API_MODELS = [
|
||||
CreateJobRequest,
|
||||
UpdateAssetRequest,
|
||||
@@ -58,14 +55,13 @@ API_MODELS = [
|
||||
ChunkInfo,
|
||||
]
|
||||
|
||||
# Status enums - included in generated code
|
||||
ENUMS = [AssetStatus, JobStatus, ChunkJobStatus, DetectJobStatus, RunType, BrandSource, SourceType]
|
||||
# Status enums
|
||||
ENUMS = [AssetStatus, JobStatus, RunType, BrandSource, SourceType]
|
||||
|
||||
# View/event models - generates TypeScript for UI consumption
|
||||
# View/event models
|
||||
VIEWS = [ChunkEvent, WorkerEvent, PipelineStats, ChunkOutputFile]
|
||||
|
||||
|
||||
# gRPC messages - generates Proto
|
||||
# gRPC messages
|
||||
GRPC_MESSAGES = [
|
||||
JobRequest,
|
||||
JobResponse,
|
||||
@@ -83,18 +79,27 @@ __all__ = [
|
||||
# Models
|
||||
"MediaAsset",
|
||||
"TranscodePreset",
|
||||
"TranscodeJob",
|
||||
"ChunkJob",
|
||||
# API Models
|
||||
"Job",
|
||||
"Timeline",
|
||||
"Checkpoint",
|
||||
"KnownBrand",
|
||||
"SourceBrandSighting",
|
||||
# Enums
|
||||
"AssetStatus",
|
||||
"JobStatus",
|
||||
"RunType",
|
||||
"BrandSource",
|
||||
"SourceType",
|
||||
# Stages
|
||||
"StageConfigField",
|
||||
"StageIO",
|
||||
"StageDefinition",
|
||||
# API
|
||||
"CreateJobRequest",
|
||||
"UpdateAssetRequest",
|
||||
"DeleteResult",
|
||||
"ScanResult",
|
||||
"SystemStatus",
|
||||
# Enums
|
||||
"AssetStatus",
|
||||
"JobStatus",
|
||||
"ChunkJobStatus",
|
||||
# gRPC
|
||||
"GRPC_SERVICE",
|
||||
"JobRequest",
|
||||
@@ -113,7 +118,6 @@ __all__ = [
|
||||
"PipelineStats",
|
||||
"ChunkOutputFile",
|
||||
# Sources
|
||||
"SourceType",
|
||||
"SourceJob",
|
||||
"ChunkInfo",
|
||||
# For generator
|
||||
|
||||
158
core/schema/models/job.py
Normal file
158
core/schema/models/job.py
Normal file
@@ -0,0 +1,158 @@
|
||||
"""
|
||||
Job, Timeline, and Checkpoint Schema Definitions
|
||||
|
||||
Source of truth for pipeline jobs, timelines, and checkpoints.
|
||||
Generates: SQLModel (core/db/models.py), TypeScript via modelgen.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional
|
||||
from uuid import UUID
|
||||
|
||||
|
||||
class JobStatus(str, Enum):
    """Lifecycle states for a pipeline Job (str-valued so it serializes cleanly to JSON/DB)."""

    PENDING = "pending"
    RUNNING = "running"
    PAUSED = "paused"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
|
||||
|
||||
|
||||
class RunType(str, Enum):
    """How a Job invocation was started — each kind creates its own Job (see Job.run_type)."""

    INITIAL = "initial"
    REPLAY = "replay"
    RETRY = "retry"
|
||||
|
||||
|
||||
@dataclass
class Job:
    """
    A pipeline job.

    Each invocation (initial run, replay, retry) creates a Job.
    Jobs for the same source are linked via parent_id.
    """

    id: UUID

    # Input
    source_asset_id: UUID
    video_path: str
    profile_name: str = "soccer_broadcast"

    # Lineage — how this run relates to earlier runs on the same source
    parent_id: Optional[UUID] = None  # linking Job for the same source; None = first run
    run_type: RunType = RunType.INITIAL
    config_overrides: Dict[str, Any] = field(default_factory=dict)

    # Status
    status: JobStatus = JobStatus.PENDING
    current_stage: Optional[str] = None
    progress: float = 0.0  # presumably a 0.0–1.0 completion fraction — TODO confirm
    error_message: Optional[str] = None  # NOTE(review): likely set alongside FAILED — verify

    # Results summary
    total_detections: int = 0
    brands_found: int = 0
    cloud_llm_calls: int = 0
    estimated_cost_usd: float = 0.0

    # Worker tracking
    celery_task_id: Optional[str] = None  # id of the Celery task executing this job, if any
    priority: int = 0

    # Timestamps — None until the corresponding transition has happened
    created_at: Optional[datetime] = None
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
|
||||
|
||||
|
||||
@dataclass
class Timeline:
    """
    The frame sequence from a source video.

    Independent of stages — exists before any stage runs.
    Frames stored in MinIO as JPEGs, metadata here.
    One timeline per job.
    """

    id: UUID
    source_asset_id: Optional[UUID] = None
    source_video: str = ""
    profile_name: str = ""
    fps: float = 2.0  # presumably the frame-sampling rate used for extraction — confirm

    frames_prefix: str = ""  # s3: timeline/{id}/frames/
    frames_manifest: Dict[int, str] = field(default_factory=dict)  # seq → s3 key
    frames_meta: List[Dict[str, Any]] = field(default_factory=list)  # per-frame metadata; shape defined by the producer — TODO confirm

    created_at: Optional[datetime] = None
|
||||
|
||||
|
||||
@dataclass
class Checkpoint:
    """
    A snapshot of pipeline state on a timeline.

    Stage outputs stored as JSONB — each stage serializes to JSON,
    the checkpoint stores it without knowing the shape.

    parent_id forms a tree: multiple children from the same parent
    = different config tries from the same starting point.
    """

    id: UUID
    timeline_id: UUID  # the Timeline this checkpoint belongs to
    parent_id: Optional[UUID] = None  # null = root checkpoint

    # Stage outputs — JSONB per stage, opaque to the checkpoint layer
    stage_outputs: Dict[str, Any] = field(default_factory=dict)

    # Config that produced this checkpoint
    config_overrides: Dict[str, Any] = field(default_factory=dict)

    # Pipeline state
    stats: Dict[str, Any] = field(default_factory=dict)

    # Scenario bookmark — lets a checkpoint be pinned with a human-readable label
    is_scenario: bool = False
    scenario_label: str = ""

    created_at: Optional[datetime] = None
|
||||
|
||||
|
||||
# --- Brands ---
|
||||
|
||||
class BrandSource(str, Enum):
    """Provenance of a brand identification — which detector or entry path produced it."""

    OCR = "ocr"
    VLM = "local_vlm"
    CLOUD = "cloud_llm"
    MANUAL = "manual"
|
||||
|
||||
|
||||
@dataclass
class Brand:
    """
    A brand discovered or registered in the system.

    Airings track where/when the brand appeared — each airing
    references a timeline and a frame range.
    """

    id: UUID
    canonical_name: str  # normalized display name; alternates live in `aliases`
    aliases: List[str] = field(default_factory=list)
    source: BrandSource = BrandSource.OCR  # how first discovered
    confirmed: bool = False  # NOTE(review): presumably a human-review flag — confirm

    # Airings — JSONB array of appearances
    # [{timeline_id, frame_start, frame_end, confidence, source, timestamp}]
    airings: List[Dict[str, Any]] = field(default_factory=list)
    total_airings: int = 0  # NOTE(review): appears to be a denormalized count of `airings` — verify maintainer keeps it in sync

    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None
|
||||
Reference in New Issue
Block a user