schema clean up and refactor
This commit is contained in:
97
core/schema/models/detect_pipeline.py
Normal file
97
core/schema/models/detect_pipeline.py
Normal file
@@ -0,0 +1,97 @@
|
||||
"""
|
||||
Detection pipeline runtime models.
|
||||
|
||||
These are the data structures that flow between LangGraph nodes.
|
||||
They contain runtime types (np.ndarray) so they are NOT generated
|
||||
by modelgen — they live here for the schema to be the complete
|
||||
map of the application, but modelgen skips them.
|
||||
|
||||
Wire-format models (SSE events) are in detect.py.
|
||||
DB models (jobs, checkpoints) are in detect_jobs.py.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Literal
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
@dataclass
class Frame:
    """Runtime frame record: ordering/position metadata plus the image array.

    ``image`` is excluded from the generated ``__eq__``. Comparing two
    distinct ndarrays with ``==`` yields an array whose truth value is
    ambiguous, so the default dataclass comparison raises ``ValueError``.
    Frames therefore compare on their metadata fields only.
    """

    sequence: int  # used as the frame key when TextCandidates are serialized
    chunk_id: int
    timestamp: float  # position in video (seconds)
    # Pixel data. NOTE(review): shape/dtype are not fixed anywhere in this module.
    image: np.ndarray = field(compare=False)
    perceptual_hash: str = ""
|
||||
|
||||
|
||||
@dataclass
class BoundingBox:
    """Rectangular region together with its detection score and label."""

    # Origin and size of the box.
    # NOTE(review): presumably pixel coordinates of the top-left corner — confirm.
    x: int
    y: int
    w: int
    h: int
    # Score and label attached to the region by whatever produced it.
    confidence: float
    label: str
|
||||
|
||||
|
||||
@dataclass
class TextCandidate:
    """Text read from one region of one frame."""

    # Direct object reference to the owning Frame (not an index).
    frame: Frame
    # Region of the frame the text was read from.
    bbox: BoundingBox
    text: str
    ocr_confidence: float
|
||||
|
||||
|
||||
@dataclass
class BrandDetection:
    """One brand sighting on the report timeline."""

    brand: str
    # NOTE(review): presumably seconds, like Frame.timestamp — confirm.
    timestamp: float
    duration: float
    confidence: float
    # Which stage produced this detection.
    source: Literal["ocr", "local_vlm", "cloud_llm", "logo_match", "auxiliary"]
    # Optional extras; all default to "not provided".
    bbox: BoundingBox | None = None
    frame_ref: int | None = None  # NOTE(review): presumably a Frame.sequence — confirm
    content_type: str = ""
|
||||
|
||||
|
||||
@dataclass
class BrandStats:
    """Per-brand aggregate figures; every field starts at zero."""

    total_appearances: int = 0
    total_screen_time: float = 0.0
    avg_confidence: float = 0.0
    # NOTE(review): presumably video positions in seconds — confirm.
    first_seen: float = 0.0
    last_seen: float = 0.0
|
||||
|
||||
|
||||
@dataclass
class PipelineStats:
    """Counters and totals for one pipeline run; every field starts at zero."""

    # Frame counts.
    frames_extracted: int = 0
    frames_after_scene_filter: int = 0

    # Region counts, by how far each region travelled through the stages.
    regions_detected: int = 0
    regions_resolved_by_ocr: int = 0
    regions_escalated_to_local_vlm: int = 0
    regions_escalated_to_cloud_llm: int = 0

    auxiliary_detections: int = 0
    cloud_llm_calls: int = 0

    # Run-level totals.
    processing_time_seconds: float = 0.0
    estimated_cloud_cost_usd: float = 0.0
|
||||
|
||||
|
||||
@dataclass
class DetectionReport:
    """Final report for one video: per-brand stats, timeline and run stats."""

    video_source: str
    content_type: str
    duration_seconds: float
    # Each container defaults to a fresh, empty instance per report.
    brands: dict[str, BrandStats] = field(default_factory=dict)
    timeline: list[BrandDetection] = field(default_factory=list)
    pipeline_stats: PipelineStats = field(default_factory=PipelineStats)
|
||||
|
||||
|
||||
# Runtime-only models. They are kept out of DATACLASSES so that modelgen
# skips them (Frame carries an np.ndarray and TextCandidate embeds a Frame).
RUNTIME_MODELS = [
    Frame,
    BoundingBox,
    TextCandidate,
    BrandDetection,
    BrandStats,
    PipelineStats,
    DetectionReport,
]
|
||||
11
core/schema/serializers/__init__.py
Normal file
11
core/schema/serializers/__init__.py
Normal file
@@ -0,0 +1,11 @@
|
||||
"""
|
||||
Model serializers — one module per model group, mirroring core/schema/models/.
|
||||
|
||||
models/detect_pipeline.py → serializers/detect_pipeline.py
|
||||
models/detect_jobs.py → serializers/detect_jobs.py
|
||||
models/detect.py → serializers/detect.py (SSE events)
|
||||
|
||||
Common utilities in _common.py.
|
||||
"""
|
||||
|
||||
from ._common import safe_construct, serialize_dataclass, serialize_dataclass_list
|
||||
38
core/schema/serializers/_common.py
Normal file
38
core/schema/serializers/_common.py
Normal file
@@ -0,0 +1,38 @@
|
||||
"""Common serialization utilities."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import dataclasses
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def safe_construct(cls, data: dict):
    """
    Construct a dataclass from a dict, tolerant of schema changes.

    - Ignores keys that are not constructor fields of the dataclass (the
      field was removed, or is declared ``init=False`` — ``asdict()`` emits
      such fields but ``__init__`` does not accept them)
    - Leaves missing keys to the dataclass defaults (field was added)
    - Logs at debug level for both kinds of mismatch

    Nested values are passed through unchanged; nested dicts are not
    rebuilt into dataclasses.

    Raises:
        TypeError: if ``data`` lacks a field that has no default.
    """
    # Only fields accepted by __init__ may be forwarded as keyword arguments.
    init_names = [f.name for f in dataclasses.fields(cls) if f.init]
    accepted = set(init_names)

    known = {}
    for key, value in data.items():
        if key in accepted:
            known[key] = value
        else:
            logger.debug("Ignoring unknown field %s.%s", cls.__name__, key)

    for name in init_names:
        if name not in known:
            logger.debug("Field %s.%s not present in data", cls.__name__, name)

    return cls(**known)
|
||||
|
||||
|
||||
def serialize_dataclass(obj) -> dict:
    """Return the ``dataclasses.asdict()`` form of a dataclass instance."""
    as_dict = dataclasses.asdict(obj)
    return as_dict
|
||||
|
||||
|
||||
def serialize_dataclass_list(items) -> list[dict]:
    """Convert each dataclass in ``items`` with ``dataclasses.asdict()``, in order."""
    return list(map(dataclasses.asdict, items))
|
||||
108
core/schema/serializers/detect_pipeline.py
Normal file
108
core/schema/serializers/detect_pipeline.py
Normal file
@@ -0,0 +1,108 @@
|
||||
"""
|
||||
Serializers for detection pipeline runtime models.
|
||||
|
||||
Mirrors core/schema/models/detect_pipeline.py.
|
||||
|
||||
Special handling:
|
||||
- Frame.image (np.ndarray → S3, excluded from JSON)
|
||||
- TextCandidate.frame (object ref → frame_sequence integer)
|
||||
Everything else is serialized with dataclasses.asdict() and rebuilt via safe_construct.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import dataclasses
|
||||
|
||||
from core.schema.models.detect_pipeline import (
|
||||
BoundingBox,
|
||||
BrandDetection,
|
||||
BrandStats,
|
||||
DetectionReport,
|
||||
Frame,
|
||||
PipelineStats,
|
||||
TextCandidate,
|
||||
)
|
||||
from ._common import safe_construct, serialize_dataclass, serialize_dataclass_list
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Frame — image goes to S3 separately
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def serialize_frame_meta(frame: Frame) -> dict:
    """Serialize Frame metadata only (no image).

    The dict is built field by field rather than with ``dataclasses.asdict()``.
    ``asdict()`` deep-copies every value, so it would copy the whole image
    array only for it to be thrown away. The remaining Frame fields are plain
    scalars/strings, so reading them directly gives the same result.

    Returns:
        A dict of every dataclass field of ``frame`` except ``"image"``, in
        field-declaration order.
    """
    return {
        f.name: getattr(frame, f.name)
        for f in dataclasses.fields(frame)
        if f.name != "image"
    }
|
||||
|
||||
|
||||
def serialize_frames_with_upload(frames: list[Frame], job_id: str) -> tuple[list[dict], dict[int, str]]:
    """Store the frame images, then return ``(metadata, manifest)``.

    Images are handed to ``save_frames`` (S3 storage, per the module notes);
    whatever it returns is passed back as the manifest. The metadata list
    holds one image-free dict per frame, in input order.
    """
    # Imported lazily, as in the rest of this module's storage helpers.
    from detect.checkpoint.frames import save_frames

    stored = save_frames(job_id, frames)
    return [serialize_frame_meta(frame) for frame in frames], stored
|
||||
|
||||
|
||||
def deserialize_frames_with_download(meta: list[dict], manifest: dict, job_id: str) -> list[Frame]:
    """Rebuild frames from stored images plus their metadata.

    Manifest keys are coerced to ``int`` before being handed to
    ``load_frames`` (presumably because a JSON round trip turns integer keys
    into strings — confirm with the caller). ``job_id`` is accepted but not
    used by this function.
    """
    from detect.checkpoint.frames import load_frames

    keyed_by_int = {}
    for key, location in manifest.items():
        keyed_by_int[int(key)] = location
    return load_frames(keyed_by_int, meta)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TextCandidate — frame ref is an object, stored as sequence int
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def serialize_text_candidate(tc: TextCandidate) -> dict:
    """Serialize a TextCandidate, replacing its Frame with the frame's sequence.

    The frame object is not embedded; only ``frame.sequence`` is written, under
    the ``"frame_sequence"`` key. The bounding box is expanded with
    ``dataclasses.asdict()``.
    """
    box_payload = dataclasses.asdict(tc.bbox)
    return {
        "frame_sequence": tc.frame.sequence,
        "bbox": box_payload,
        "text": tc.text,
        "ocr_confidence": tc.ocr_confidence,
    }
|
||||
|
||||
|
||||
def serialize_text_candidates(candidates: list[TextCandidate]) -> list[dict]:
    """Serialize each candidate with ``serialize_text_candidate``, keeping order."""
    return list(map(serialize_text_candidate, candidates))
|
||||
|
||||
|
||||
def deserialize_text_candidate(data: dict, frame_map: dict[int, Frame]) -> TextCandidate:
    """Rebuild a TextCandidate from its serialized dict.

    ``data["frame_sequence"]`` is looked up in ``frame_map`` to recover the
    Frame object, and a missing entry raises ``KeyError``. The bounding box is
    rebuilt with ``safe_construct``.
    """
    return TextCandidate(
        frame=frame_map[data["frame_sequence"]],
        bbox=safe_construct(BoundingBox, data["bbox"]),
        text=data["text"],
        ocr_confidence=data["ocr_confidence"],
    )
|
||||
|
||||
|
||||
def deserialize_text_candidates(data: list[dict], frame_map: dict[int, Frame]) -> list[TextCandidate]:
    """Rebuild every serialized candidate against the same ``frame_map``, in order."""
    rebuilt = []
    for entry in data:
        rebuilt.append(deserialize_text_candidate(entry, frame_map))
    return rebuilt
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# BoundingBox, BrandDetection, PipelineStats, etc — standard dataclasses
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def deserialize_bounding_box(data: dict) -> BoundingBox:
    """Rebuild a BoundingBox from its dict form using ``safe_construct``."""
    box = safe_construct(BoundingBox, data)
    return box
|
||||
|
||||
|
||||
def deserialize_brand_detection(data: dict) -> BrandDetection:
    """Rebuild a BrandDetection from its dict form.

    ``dataclasses.asdict()`` flattens the nested ``bbox`` into a plain dict
    and ``safe_construct`` does not recurse, so the box is rebuilt here first.
    Without this, ``bbox`` would come back as a dict instead of a BoundingBox.
    A ``bbox`` of ``None``, an absent one, or one that is already an object is
    left as-is. ``data`` itself is never mutated.
    """
    bbox = data.get("bbox")
    if isinstance(bbox, dict):
        # Copy so the caller's payload keeps its original nested dict.
        data = {**data, "bbox": safe_construct(BoundingBox, bbox)}
    return safe_construct(BrandDetection, data)
|
||||
|
||||
|
||||
def deserialize_pipeline_stats(data: dict) -> PipelineStats:
    """Rebuild PipelineStats from its dict form using ``safe_construct``."""
    stats = safe_construct(PipelineStats, data)
    return stats
|
||||
|
||||
|
||||
def deserialize_detection_report(data: dict) -> DetectionReport:
    """Rebuild a DetectionReport, including its nested dataclasses.

    ``dataclasses.asdict()`` flattens ``brands``, ``timeline`` and
    ``pipeline_stats`` into plain dicts/lists and ``safe_construct`` does not
    recurse, so each nested value is rebuilt before the report is constructed.
    Values that are absent fall back to the dataclass defaults, and values
    that are not in dict form are passed through untouched. ``data`` itself
    is never mutated.
    """
    rebuilt = dict(data)  # shallow copy: keep the caller's payload intact

    brands = rebuilt.get("brands")
    if isinstance(brands, dict):
        rebuilt["brands"] = {
            name: safe_construct(BrandStats, stats) if isinstance(stats, dict) else stats
            for name, stats in brands.items()
        }

    timeline = rebuilt.get("timeline")
    if isinstance(timeline, list):
        detections = []
        for entry in timeline:
            if isinstance(entry, dict):
                entry = deserialize_brand_detection(entry)
                # Make sure the nested box is an object, not a leftover dict.
                if isinstance(entry.bbox, dict):
                    entry.bbox = safe_construct(BoundingBox, entry.bbox)
            detections.append(entry)
        rebuilt["timeline"] = detections

    stats = rebuilt.get("pipeline_stats")
    if isinstance(stats, dict):
        rebuilt["pipeline_stats"] = safe_construct(PipelineStats, stats)

    return safe_construct(DetectionReport, rebuilt)
|
||||
Reference in New Issue
Block a user