# mediaproc/core/detect/models.py

"""
Detection pipeline runtime models.

These are the data structures that flow between pipeline stages.
They contain runtime types (np.ndarray) so they live here, not in
core/schema/models/ (which is for modelgen source of truth).
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Literal

import numpy as np


@dataclass
class Frame:
    """One video frame flowing between detection pipeline stages.

    Equality (``==``) compares the frame's metadata only (``sequence``,
    ``chunk_id``, ``timestamp``, ``perceptual_hash``). ``image`` is excluded
    from comparison: ``==`` between two distinct ``np.ndarray`` objects
    yields an array rather than a bool, so the dataclass-generated
    ``__eq__`` would raise ``ValueError`` ("truth value of an array ... is
    ambiguous") instead of answering the question.

    The constructor signature is unaffected: ``image`` is still a required
    argument in the same position.
    """

    sequence: int
    chunk_id: int
    timestamp: float  # position in video (seconds)
    # compare=False keeps the generated __eq__ from truth-testing an
    # element-wise ndarray comparison. Pixel layout (shape / dtype / channel
    # order) is not fixed by this module — TODO confirm with the producer.
    image: np.ndarray = field(compare=False)
    perceptual_hash: str = ""  # empty string until a hash is assigned


@dataclass
class BoundingBox:
    """Labelled rectangular region with an attached confidence score.

    NOTE(review): the coordinate convention is not stated in this module.
    Presumably ``x``/``y`` are the top-left corner and ``w``/``h`` the
    width/height, in pixels of the source frame — confirm with the stage
    that produces these boxes.
    """

    x: int
    y: int
    w: int
    h: int
    confidence: float  # value range is not enforced here
    label: str  # free-form string; no fixed vocabulary is declared here


@dataclass
class TextCandidate:
    """A piece of text tied to a bounding box within a specific frame.

    NOTE(review): the field names suggest this is the output of an OCR
    stage (text read from ``bbox`` inside ``frame``); the producer is not
    visible in this module — confirm.
    """

    frame: Frame  # frame the text was found in
    bbox: BoundingBox  # region associated with the text
    text: str
    ocr_confidence: float  # value range is not enforced here


@dataclass
class BrandDetection:
    """A single brand sighting; the element type of ``DetectionReport.timeline``.

    ``brand``, ``timestamp``, ``duration``, ``confidence`` and ``source``
    are required; the remaining fields are optional context.
    """

    brand: str
    timestamp: float  # presumably seconds into the video, like Frame.timestamp — confirm
    duration: float  # presumably seconds — confirm
    confidence: float  # value range is not enforced here
    # Closed set of origin tags. Which component emits each tag is not
    # visible in this module.
    source: Literal["ocr", "local_vlm", "cloud_llm", "logo_match", "auxiliary"]
    bbox: BoundingBox | None = None  # None when no region is attached
    frame_ref: int | None = None  # presumably a Frame.sequence value — confirm
    content_type: str = ""  # empty string when not set


@dataclass
class BrandStats:
    """Aggregate figures for one brand; the value type of ``DetectionReport.brands``.

    Every field defaults to zero, so ``BrandStats()`` is a valid empty
    accumulator.

    NOTE(review): because ``first_seen`` defaults to ``0.0``, an untouched
    instance is indistinguishable from a brand first seen at t=0 — callers
    presumably check ``total_appearances`` first; confirm.
    """

    total_appearances: int = 0
    total_screen_time: float = 0.0  # presumably seconds — confirm
    avg_confidence: float = 0.0
    first_seen: float = 0.0  # presumably seconds into the video — confirm
    last_seen: float = 0.0  # presumably seconds into the video — confirm


@dataclass
class PipelineStats:
    """Counters and totals describing one pipeline run.

    Used as ``DetectionReport.pipeline_stats``. Every field defaults to
    zero, so ``PipelineStats()`` is a valid starting point for a run.

    NOTE(review): both ``cv_regions_detected`` and ``regions_detected``
    exist; how they differ is not documented in this module — confirm with
    the stages that increment them.
    """

    # Frame counts.
    frames_extracted: int = 0
    frames_after_scene_filter: int = 0
    # Region counts, including how many were escalated to each fallback.
    cv_regions_detected: int = 0
    regions_detected: int = 0
    regions_resolved_by_ocr: int = 0
    regions_escalated_to_local_vlm: int = 0
    regions_escalated_to_cloud_llm: int = 0
    auxiliary_detections: int = 0
    # Cost and timing.
    cloud_llm_calls: int = 0
    processing_time_seconds: float = 0.0
    estimated_cloud_cost_usd: float = 0.0


@dataclass
class DetectionReport:
    """Result container for one video: per-brand stats, timeline and run stats.

    ``video_source``, ``content_type`` and ``duration_seconds`` are
    required. The three collection/stat fields use ``default_factory``, so
    each report starts with its own empty dict, empty list and zeroed
    ``PipelineStats`` rather than sharing them between instances.
    """

    video_source: str
    content_type: str
    duration_seconds: float
    # Keyed by a string — presumably the brand name used in
    # BrandDetection.brand; confirm.
    brands: dict[str, BrandStats] = field(default_factory=dict)
    # Individual detections; ordering is not enforced here.
    timeline: list[BrandDetection] = field(default_factory=list)
    pipeline_stats: PipelineStats = field(default_factory=PipelineStats)


@dataclass
class CropContext:
    """Runtime type — holds image bytes for VLM prompts.

    Only ``image`` is required; the two text fields default to the empty
    string.
    """

    # Raw bytes; the image encoding (e.g. JPEG/PNG) is not specified in
    # this module — TODO confirm with the caller that builds the prompt.
    image: bytes
    surrounding_text: str = ""
    position_hint: str = ""