major refactor

2026-03-27 06:02:58 -03:00
parent bcf6f3dc71
commit 51ce14a812
18 changed files with 351 additions and 523 deletions

View File

@@ -25,11 +25,10 @@ from .grpc import (
ProgressUpdate,
WorkerStatus,
)
-from .job import (
-    Job, JobStatus, RunType,
-    Timeline, Checkpoint,
-    BrandSource, Brand,
-)
+from .job import Job, JobStatus, RunType
+from .timeline import Timeline
+from .checkpoint import Checkpoint
+from .brand import BrandSource, Brand
from .media import AssetStatus, MediaAsset
from .presets import BUILTIN_PRESETS, TranscodePreset
from .detect import DETECT_VIEWS # noqa: F401 — discovered by modelgen generic loader
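The net effect of this hunk: the package's public surface stays the same while the definitions move into per-concern modules. A hedged sketch of the two equivalent import paths (assuming this is the __init__ of core.schema.models, which the serializer imports further down suggest but this hunk does not state):

# Either path resolves to the same classes after the refactor:
from core.schema.models import Job, Timeline, Checkpoint, Brand  # via __init__ re-exports
from core.schema.models.brand import Brand, BrandSource          # per-module path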

View File

@@ -0,0 +1,38 @@
"""Brand schema — source of truth for brand discovery."""
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional
from uuid import UUID
class BrandSource(str, Enum):
OCR = "ocr"
VLM = "local_vlm"
CLOUD = "cloud_llm"
MANUAL = "manual"
@dataclass
class Brand:
"""
A brand discovered or registered in the system.
Airings track where/when the brand appeared — each airing
references a timeline and a frame range.
"""
id: UUID
canonical_name: str
aliases: List[str] = field(default_factory=list)
source: BrandSource = BrandSource.OCR # how first discovered
confirmed: bool = False
# Airings — JSONB array of appearances
# [{timeline_id, frame_start, frame_end, confidence, source, timestamp}]
airings: List[Dict[str, Any]] = field(default_factory=list)
total_airings: int = 0
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
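A minimal sketch of how a stage might record an airing against this schema. The dict keys follow the comment above; the concrete values, the uuid4 ids, and the epoch-float timestamp are illustrative assumptions:

from uuid import uuid4

brand = Brand(id=uuid4(), canonical_name="Acme Cola", aliases=["ACME COLA"])
brand.airings.append({
    "timeline_id": str(uuid4()),  # the timeline the brand appeared on
    "frame_start": 120,
    "frame_end": 168,
    "confidence": 0.91,
    "source": BrandSource.OCR.value,
    "timestamp": 1743052978.0,    # epoch seconds assumed; the type isn't pinned down here
})
brand.total_airings = len(brand.airings)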

View File

@@ -0,0 +1,38 @@
"""Checkpoint schema — source of truth for pipeline state snapshots."""
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, Optional
from uuid import UUID
@dataclass
class Checkpoint:
"""
A snapshot of pipeline state on a timeline.
Stage outputs stored as JSONB — each stage serializes to JSON,
the checkpoint stores it without knowing the shape.
parent_id forms a tree: multiple children from the same parent
= different config tries from the same starting point.
"""
id: UUID
timeline_id: UUID
parent_id: Optional[UUID] = None # null = root checkpoint
# Stage outputs — JSONB per stage, opaque to the checkpoint layer
stage_outputs: Dict[str, Any] = field(default_factory=dict)
# Config that produced this checkpoint
config_overrides: Dict[str, Any] = field(default_factory=dict)
# Pipeline state
stats: Dict[str, Any] = field(default_factory=dict)
# Scenario bookmark
is_scenario: bool = False
scenario_label: str = ""
created_at: Optional[datetime] = None
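The parent_id tree in practice: two children forked from one root to try different configs, one bookmarked as a scenario. A sketch with illustrative names and values (ocr_min_confidence is hypothetical):

from uuid import uuid4

timeline_id = uuid4()
root = Checkpoint(id=uuid4(), timeline_id=timeline_id)
# Siblings share a parent but diverge on config: two tries from one starting point.
loose = Checkpoint(id=uuid4(), timeline_id=timeline_id, parent_id=root.id,
                   config_overrides={"ocr_min_confidence": 0.3})
strict = Checkpoint(id=uuid4(), timeline_id=timeline_id, parent_id=root.id,
                    config_overrides={"ocr_min_confidence": 0.8},
                    is_scenario=True, scenario_label="high-precision pass")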

View File

@@ -1,177 +0,0 @@
"""
Detection Job and Checkpoint Schema Definitions
Source of truth for detection pipeline job tracking and stage checkpoints.
Follows the TranscodeJob/ChunkJob pattern.
"""
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional
from uuid import UUID
class DetectJobStatus(str, Enum):
PENDING = "pending"
RUNNING = "running"
PAUSED = "paused"
COMPLETED = "completed"
FAILED = "failed"
CANCELLED = "cancelled"
class RunType(str, Enum):
INITIAL = "initial"
REPLAY = "replay"
RETRY = "retry"
@dataclass
class DetectJob:
"""
A detection pipeline job.
Each invocation of the pipeline (initial run, replay, retry) creates a DetectJob.
Jobs for the same source video are linked via parent_job_id.
"""
id: UUID
# Input
source_asset_id: UUID
video_path: str
profile_name: str = "soccer_broadcast"
# Run lineage
parent_job_id: Optional[UUID] = None # links all runs for the same source
run_type: RunType = RunType.INITIAL
replay_from_stage: Optional[str] = None # null for initial runs
config_overrides: Dict[str, Any] = field(default_factory=dict)
# Status
status: DetectJobStatus = DetectJobStatus.PENDING
current_stage: Optional[str] = None
progress: float = 0.0
error_message: Optional[str] = None
# Results summary
total_detections: int = 0
brands_found: int = 0
cloud_llm_calls: int = 0
estimated_cost_usd: float = 0.0
# Worker tracking
celery_task_id: Optional[str] = None
priority: int = 0
# Timestamps
created_at: Optional[datetime] = None
started_at: Optional[datetime] = None
completed_at: Optional[datetime] = None
@dataclass
class Timeline:
"""
The frame sequence from a source video.
Independent of stages — exists before any stage runs.
Stages annotate the timeline, they don't own it.
Frames are stored in MinIO as JPEGs.
"""
id: UUID
source_asset_id: Optional[UUID] = None
source_video: str = ""
profile_name: str = ""
fps: float = 2.0
# Frame metadata (images in MinIO, metadata here)
frames_prefix: str = "" # s3: timelines/{id}/frames/
frames_manifest: Dict[int, str] = field(default_factory=dict) # seq → s3 key
frames_meta: List[Dict[str, Any]] = field(default_factory=list)
created_at: Optional[datetime] = None
@dataclass
class Checkpoint:
"""
A snapshot of pipeline state on a timeline.
Stage outputs stored as JSONB — each stage serializes to JSON,
the checkpoint stores it without knowing the shape.
parent_id forms a tree: multiple children from the same parent
= different config tries from the same starting point.
"""
id: UUID
timeline_id: UUID
parent_id: Optional[UUID] = None # null = root checkpoint
# Stage outputs — JSONB per stage, opaque to the checkpoint layer
stage_outputs: Dict[str, Any] = field(default_factory=dict)
# Config that produced this checkpoint
config_overrides: Dict[str, Any] = field(default_factory=dict)
# Pipeline state
stats: Dict[str, Any] = field(default_factory=dict)
# Scenario bookmark
is_scenario: bool = False
scenario_label: str = ""
created_at: Optional[datetime] = None
class BrandSource(str, Enum):
"""How a brand was first identified."""
OCR = "ocr"
VLM = "local_vlm"
CLOUD = "cloud_llm"
MANUAL = "manual" # user-added via UI
@dataclass
class KnownBrand:
"""
A brand discovered or registered in the system.
Global — not per-source. Accumulates across all pipeline runs.
Aliases enable fuzzy matching without re-escalating to VLM.
"""
id: UUID
canonical_name: str # normalized display name
aliases: List[str] = field(default_factory=list) # known spellings/variants
first_source: BrandSource = BrandSource.OCR
total_occurrences: int = 0
confirmed: bool = False # manually confirmed by user
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
@dataclass
class SourceBrandSighting:
"""
A brand seen in a specific source (video/asset).
Per-source session cache — avoids re-escalating the same brand
on subsequent frames or re-runs of the same source.
"""
id: UUID
source_asset_id: UUID # the video this sighting belongs to
brand_id: UUID # FK to KnownBrand
brand_name: str # denormalized for fast lookup
first_seen_timestamp: float = 0.0
last_seen_timestamp: float = 0.0
occurrences: int = 0
detection_source: BrandSource = BrandSource.OCR
avg_confidence: float = 0.0
created_at: Optional[datetime] = None
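Worth noting what this deletion collapses: KnownBrand (global identity) and SourceBrandSighting (per-source cache) fold into the single Brand above, whose airings array now carries the appearances. A one-off illustration of that mapping, not a migration shipped in this commit; field correspondences are inferred from the two docstrings:

def fold_into_brand(known, sightings):
    """Illustrative only: collapse a KnownBrand plus its sightings
    into the new Brand shape."""
    brand = Brand(
        id=known.id,
        canonical_name=known.canonical_name,
        aliases=list(known.aliases),
        source=known.first_source,
        confirmed=known.confirmed,
    )
    for s in sightings:
        brand.airings.append({
            "timeline_id": None,   # legacy sightings were asset-scoped, not timeline-scoped
            "frame_start": None,   # old rows kept timestamps, not frame ranges
            "frame_end": None,
            "confidence": s.avg_confidence,
            "source": s.detection_source.value,
            "timestamp": s.first_seen_timestamp,
        })
    brand.total_airings = len(brand.airings)
    return brand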

View File

@@ -1,14 +1,9 @@
"""
Job, Timeline, and Checkpoint Schema Definitions
Source of truth for pipeline jobs, timelines, and checkpoints.
Generates: SQLModel (core/db/models.py), TypeScript via modelgen.
"""
"""Job schema — source of truth for pipeline jobs."""
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional
from typing import Any, Dict, Optional
from uuid import UUID
@@ -68,91 +63,3 @@ class Job:
created_at: Optional[datetime] = None
started_at: Optional[datetime] = None
completed_at: Optional[datetime] = None
@dataclass
class Timeline:
"""
The frame sequence from a source video.
Independent of stages — exists before any stage runs.
Frames stored in MinIO as JPEGs, metadata here.
One timeline per job.
"""
id: UUID
source_asset_id: Optional[UUID] = None
source_video: str = ""
profile_name: str = ""
fps: float = 2.0
frames_prefix: str = "" # s3: timeline/{id}/frames/
frames_manifest: Dict[int, str] = field(default_factory=dict) # seq → s3 key
frames_meta: List[Dict[str, Any]] = field(default_factory=list)
created_at: Optional[datetime] = None
@dataclass
class Checkpoint:
"""
A snapshot of pipeline state on a timeline.
Stage outputs stored as JSONB — each stage serializes to JSON,
the checkpoint stores it without knowing the shape.
parent_id forms a tree: multiple children from the same parent
= different config tries from the same starting point.
"""
id: UUID
timeline_id: UUID
parent_id: Optional[UUID] = None # null = root checkpoint
# Stage outputs — JSONB per stage, opaque to the checkpoint layer
stage_outputs: Dict[str, Any] = field(default_factory=dict)
# Config that produced this checkpoint
config_overrides: Dict[str, Any] = field(default_factory=dict)
# Pipeline state
stats: Dict[str, Any] = field(default_factory=dict)
# Scenario bookmark
is_scenario: bool = False
scenario_label: str = ""
created_at: Optional[datetime] = None
# --- Brands ---
class BrandSource(str, Enum):
OCR = "ocr"
VLM = "local_vlm"
CLOUD = "cloud_llm"
MANUAL = "manual"
@dataclass
class Brand:
"""
A brand discovered or registered in the system.
Airings track where/when the brand appeared — each airing
references a timeline and a frame range.
"""
id: UUID
canonical_name: str
aliases: List[str] = field(default_factory=list)
source: BrandSource = BrandSource.OCR # how first discovered
confirmed: bool = False
# Airings — JSONB array of appearances
# [{timeline_id, frame_start, frame_end, confidence, source, timestamp}]
airings: List[Dict[str, Any]] = field(default_factory=list)
total_airings: int = 0
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None

View File

@@ -1,13 +1,9 @@
"""
Detection pipeline runtime models.
These are the data structures that flow between LangGraph nodes.
They contain runtime types (np.ndarray) so they are NOT generated
by modelgen they live here for the schema to be the complete
map of the application, but modelgen skips them.
Wire-format models (SSE events) are in detect.py.
DB models (jobs, checkpoints) are in detect_jobs.py.
These are the data structures that flow between pipeline stages.
They contain runtime types (np.ndarray) so modelgen skips them
not generated to SQLModel or TypeScript.
"""
from __future__ import annotations
@@ -89,10 +85,3 @@ class DetectionReport:
brands: dict[str, BrandStats] = field(default_factory=dict)
timeline: list[BrandDetection] = field(default_factory=list)
pipeline_stats: PipelineStats = field(default_factory=PipelineStats)
# Not in DATACLASSES — modelgen skips these (they contain np.ndarray)
RUNTIME_MODELS = [
Frame, BoundingBox, TextCandidate, BrandDetection,
BrandStats, PipelineStats, DetectionReport,
]
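The RUNTIME_MODELS list documents intent rather than driving generation: these classes are skipped simply by not appearing in whatever DATACLASSES registry modelgen reads. A minimal sketch of a generator-side guard under that assumption (should_generate is hypothetical, not part of this diff):

from core.schema.models.pipeline import RUNTIME_MODELS, Frame

def should_generate(model) -> bool:
    # Runtime-only models carry np.ndarray fields, which can't round-trip
    # to SQLModel or TypeScript, so the emitters never see them.
    return model not in RUNTIME_MODELS

assert not should_generate(Frame)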

View File

@@ -0,0 +1,29 @@
"""Timeline schema — source of truth for frame sequences."""
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional
from uuid import UUID
@dataclass
class Timeline:
"""
The frame sequence from a source video.
Independent of stages — exists before any stage runs.
Frames stored in MinIO as JPEGs, metadata here.
One timeline per job.
"""
id: UUID
source_asset_id: Optional[UUID] = None
source_video: str = ""
profile_name: str = ""
fps: float = 2.0
frames_prefix: str = "" # s3: timeline/{id}/frames/
frames_manifest: Dict[int, str] = field(default_factory=dict) # seq → s3 key
frames_meta: List[Dict[str, Any]] = field(default_factory=list)
created_at: Optional[datetime] = None
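A sketch of how a stage might resolve a frame image key from this schema. The manifest-first lookup and the zero-padded fallback name are assumptions; only the prefix layout comes from the comment above:

def frame_key(timeline: Timeline, seq: int) -> str:
    # Prefer the manifest (seq -> s3 key); otherwise derive from the prefix.
    return timeline.frames_manifest.get(
        seq, f"{timeline.frames_prefix}{seq:06d}.jpg"
    )

# e.g. frame_key(tl, 42) -> "timeline/<id>/frames/000042.jpg"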

View File

@@ -1,8 +1,6 @@
"""
Serializers for detection pipeline runtime models.
Mirrors core/schema/models/detect_pipeline.py.
Special handling:
- Frame.image (np.ndarray → S3, excluded from JSON)
- TextCandidate.frame (object ref → frame_sequence integer)
@@ -13,7 +11,7 @@ from __future__ import annotations
import dataclasses
-from core.schema.models.detect_pipeline import (
+from core.schema.models.pipeline import (
BoundingBox,
BrandDetection,
BrandStats,
@@ -59,13 +57,12 @@ def deserialize_frames_with_download(meta: list[dict], manifest: dict, job_id: s
def serialize_text_candidate(tc: TextCandidate) -> dict:
bbox_dict = dataclasses.asdict(tc.bbox)
-    result = {
+    return {
"frame_sequence": tc.frame.sequence,
"bbox": bbox_dict,
"text": tc.text,
"ocr_confidence": tc.ocr_confidence,
}
-    return result
def serialize_text_candidates(candidates: list[TextCandidate]) -> list[dict]:
@@ -75,13 +72,12 @@ def serialize_text_candidates(candidates: list[TextCandidate]) -> list[dict]:
def deserialize_text_candidate(data: dict, frame_map: dict[int, Frame]) -> TextCandidate:
frame = frame_map[data["frame_sequence"]]
bbox = safe_construct(BoundingBox, data["bbox"])
-    candidate = TextCandidate(
+    return TextCandidate(
frame=frame,
bbox=bbox,
text=data["text"],
ocr_confidence=data["ocr_confidence"],
)
-    return candidate
def deserialize_text_candidates(data: list[dict], frame_map: dict[int, Frame]) -> list[TextCandidate]:
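The rewrites above are behavior-preserving: they return the literal instead of binding it to a local first. A hedged round-trip under the frame_sequence convention (frames and candidates are assumed to be in scope; the plural helpers map the singular ones over a list, as their signatures suggest):

frame_map = {f.sequence: f for f in frames}          # sequence -> Frame
payload = serialize_text_candidates(candidates)      # JSON-safe dicts, no ndarrays
restored = deserialize_text_candidates(payload, frame_map)
assert restored[0].frame is frame_map[payload[0]["frame_sequence"]]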