phase cv 0
This commit is contained in:
@@ -26,6 +26,7 @@ class OverrideProfile:
|
||||
{
|
||||
"frame_extraction": {"fps": 1.0},
|
||||
"scene_filter": {"hamming_threshold": 12},
|
||||
"region_analysis": {"edge_canny_low": 30, "edge_canny_high": 120},
|
||||
"detection": {"confidence_threshold": 0.5},
|
||||
"ocr": {"languages": ["en", "es"], "min_confidence": 0.3},
|
||||
"resolver": {"fuzzy_threshold": 60},
|
||||
@@ -52,6 +53,9 @@ class OverrideProfile:
|
||||
def scene_filter_config(self):
    """Base profile's scene-filter config with any "scene_filter" overrides applied via self._patch."""
    return self._patch(self._base.scene_filter_config(), "scene_filter")
|
||||
|
||||
def region_analysis_config(self):
    """Base profile's region-analysis config with any "region_analysis" overrides applied via self._patch."""
    return self._patch(self._base.region_analysis_config(), "region_analysis")
|
||||
|
||||
def detection_config(self):
    """Base profile's detection config with any "detection" overrides applied via self._patch."""
    return self._patch(self._base.detection_config(), "detection")
|
||||
|
||||
@@ -130,3 +134,137 @@ def replay_from(
|
||||
emit.clear_run_context()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def replay_single_stage(
    job_id: str,
    stage: str,
    frame_refs: list[int] | None = None,
    config_overrides: dict | None = None,
    debug: bool = False,
) -> dict:
    """
    Re-run one pipeline stage against the previous stage's checkpoint.

    Fast path for interactive parameter tuning: only the target stage
    function executes — nothing downstream — and its output is returned
    directly rather than flowing through the full pipeline.

    With debug=True on the detect_edges stage, overlay data (Canny edges,
    Hough lines) is included for visual feedback in the editor.

    For detect_edges the result is {"edge_regions_by_frame": {seq: [box, ...]}};
    with debug=True it also carries
    {"debug": {seq: {edge_overlay_b64, lines_overlay_b64, ...}}}.

    Raises:
        ValueError: unknown stage, first stage requested, missing checkpoint,
            or a stage whose single-stage replay is not implemented yet.
    """
    # Validate the requested stage before touching any storage.
    if stage not in NODES:
        raise ValueError(f"Unknown stage: {stage!r}. Options: {NODES}")

    position = NODES.index(stage)
    if position == 0:
        raise ValueError("Cannot replay the first stage — just run the full pipeline")
    prior = NODES[position - 1]

    # The replay consumes the checkpoint written by the preceding stage.
    checkpoints = list_checkpoints(job_id)
    if prior not in checkpoints:
        raise ValueError(
            f"No checkpoint for stage {prior!r} (job {job_id}). "
            f"Available: {checkpoints}"
        )

    logger.info("Single-stage replay: job %s, stage %s (loading checkpoint: %s, debug=%s)",
                job_id, stage, prior, debug)

    state = load_checkpoint(job_id, prior)

    # Rebuild the config profile, layering caller overrides on top if given.
    from detect.profiles import get_profile
    profile = get_profile(state.get("profile_name", "soccer_broadcast"))
    if config_overrides:
        profile = OverrideProfile(profile, config_overrides)

    # Dispatch straight to the stage function (bypassing the graph).
    if stage != "detect_edges":
        raise ValueError(
            f"Single-stage replay not yet implemented for {stage!r}. "
            f"Use replay_from() for full pipeline replay."
        )
    return _replay_detect_edges(state, profile, frame_refs, job_id, debug)
|
||||
|
||||
|
||||
def _replay_detect_edges(
    state: dict,
    profile,
    frame_refs: list[int] | None,
    job_id: str,
    debug: bool,
) -> dict:
    """Run edge detection on checkpoint frames, optionally with debug overlays.

    Args:
        state: Deserialized checkpoint state; frames are read from
            state["filtered_frames"].
        profile: Config provider — only region_analysis_config() is used here.
        frame_refs: Optional frame sequence numbers to restrict the replay to.
            None (or an empty list) means all checkpoint frames.
        job_id: Job identifier, forwarded to the detector / inference client.
        debug: When True, also collect per-frame overlay images and line counts.

    Returns:
        {"edge_regions_by_frame": <detect_edge_regions result>} and, when
        debug is True and at least one frame was selected, an additional
        "debug" key mapping frame sequence -> overlay data.
    """
    import os
    from detect.stages.edge_detector import detect_edge_regions

    config = profile.region_analysis_config()
    frames = state.get("filtered_frames", [])

    # Optional narrowing to specific frames by sequence number.
    # NOTE: an empty frame_refs list is falsy here and therefore means "all frames".
    if frame_refs:
        ref_set = set(frame_refs)
        frames = [f for f in frames if f.sequence in ref_set]

    # Remote inference endpoint, if configured; unset selects the local code path below.
    inference_url = os.environ.get("INFERENCE_URL")

    # Normal run — always needed for the boxes
    result = detect_edge_regions(
        frames=frames,
        config=config,
        inference_url=inference_url,
        job_id=job_id,
    )
    output = {"edge_regions_by_frame": result}

    # Debug overlays — call debug endpoint (remote) or local debug function
    if debug and frames:
        debug_data = {}
        if inference_url:
            # Remote path: one debug call per frame against the inference service.
            from detect.inference import InferenceClient
            client = InferenceClient(base_url=inference_url, job_id=job_id)
            for frame in frames:
                dr = client.detect_edges_debug(
                    image=frame.image,
                    edge_canny_low=config.edge_canny_low,
                    edge_canny_high=config.edge_canny_high,
                    edge_hough_threshold=config.edge_hough_threshold,
                    edge_hough_min_length=config.edge_hough_min_length,
                    edge_hough_max_gap=config.edge_hough_max_gap,
                    edge_pair_max_distance=config.edge_pair_max_distance,
                    edge_pair_min_distance=config.edge_pair_min_distance,
                )
                # Remote client result exposes attributes (unlike the local dict below).
                debug_data[frame.sequence] = {
                    "edge_overlay_b64": dr.edge_overlay_b64,
                    "lines_overlay_b64": dr.lines_overlay_b64,
                    "horizontal_count": dr.horizontal_count,
                    "pair_count": dr.pair_count,
                }
        else:
            # Local mode — import GPU module directly
            from detect.stages.edge_detector import _load_cv_edges
            edges_mod = _load_cv_edges()
            for frame in frames:
                # Local API takes the same parameters without the "edge_" prefix.
                dr = edges_mod.detect_edges_debug(
                    frame.image,
                    canny_low=config.edge_canny_low,
                    canny_high=config.edge_canny_high,
                    hough_threshold=config.edge_hough_threshold,
                    hough_min_length=config.edge_hough_min_length,
                    hough_max_gap=config.edge_hough_max_gap,
                    pair_max_distance=config.edge_pair_max_distance,
                    pair_min_distance=config.edge_pair_min_distance,
                )
                # Local module returns a plain dict.
                debug_data[frame.sequence] = {
                    "edge_overlay_b64": dr["edge_overlay_b64"],
                    "lines_overlay_b64": dr["lines_overlay_b64"],
                    "horizontal_count": dr["horizontal_count"],
                    "pair_count": dr["pair_count"],
                }
        output["debug"] = debug_data

    return output
|
||||
|
||||
@@ -2,39 +2,19 @@
|
||||
Checkpoint storage — save/load stage state.
|
||||
|
||||
Binary data (frame images) → S3/MinIO via frames.py
|
||||
Structured data (boxes, detections, stats, config) → Postgres via Django ORM
|
||||
|
||||
Until the Django model is generated by modelgen, checkpoint data is stored
|
||||
as JSON in S3 as a fallback. Once DetectJob/StageCheckpoint models exist,
|
||||
this module switches to Postgres.
|
||||
Structured data (stage output, stats, config) → Postgres
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from .frames import save_frames, load_frames, BUCKET, CHECKPOINT_PREFIX
|
||||
from .frames import save_frames, load_frames, CHECKPOINT_PREFIX
|
||||
from .serializer import serialize_state, deserialize_state
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _has_db() -> bool:
|
||||
"""Check if Postgres is reachable."""
|
||||
try:
|
||||
from core.db.connection import get_session
|
||||
from sqlmodel import text
|
||||
with get_session() as session:
|
||||
session.exec(text("SELECT 1"))
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Save
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -45,34 +25,24 @@ def save_checkpoint(
|
||||
stage_index: int,
|
||||
state: dict,
|
||||
frames_manifest: dict[int, str] | None = None,
|
||||
is_scenario: bool = False,
|
||||
scenario_label: str = "",
|
||||
) -> str:
|
||||
"""
|
||||
Save a stage checkpoint.
|
||||
|
||||
Saves frame images to S3 (if not already saved), then persists
|
||||
structured state to Postgres (or S3 JSON fallback).
|
||||
structured state to Postgres.
|
||||
|
||||
Returns the checkpoint identifier (DB id or S3 key).
|
||||
Returns the checkpoint DB id.
|
||||
"""
|
||||
# Save frames to S3 if no manifest provided
|
||||
from core.db.detect import save_stage_checkpoint
|
||||
|
||||
if frames_manifest is None:
|
||||
all_frames = state.get("frames", [])
|
||||
frames_manifest = save_frames(job_id, all_frames)
|
||||
|
||||
checkpoint_data = serialize_state(state, frames_manifest)
|
||||
|
||||
if _has_db():
|
||||
checkpoint_id = _save_to_db(job_id, stage, stage_index, checkpoint_data)
|
||||
else:
|
||||
checkpoint_id = _save_to_s3(job_id, stage, checkpoint_data)
|
||||
|
||||
return checkpoint_id
|
||||
|
||||
|
||||
def _save_to_db(job_id: str, stage: str, stage_index: int, data: dict) -> str:
|
||||
"""Save checkpoint structured data to Postgres."""
|
||||
from core.db.detect import save_stage_checkpoint
|
||||
|
||||
frames_prefix = f"{CHECKPOINT_PREFIX}/{job_id}/frames/"
|
||||
|
||||
checkpoint = save_stage_checkpoint(
|
||||
@@ -80,44 +50,24 @@ def _save_to_db(job_id: str, stage: str, stage_index: int, data: dict) -> str:
|
||||
stage=stage,
|
||||
stage_index=stage_index,
|
||||
frames_prefix=frames_prefix,
|
||||
frames_manifest=data.get("frames_manifest", {}),
|
||||
frames_meta=data.get("frames_meta", []),
|
||||
filtered_frame_sequences=data.get("filtered_frame_sequences", []),
|
||||
boxes_by_frame=data.get("boxes_by_frame", {}),
|
||||
text_candidates=data.get("text_candidates", []),
|
||||
unresolved_candidates=data.get("unresolved_candidates", []),
|
||||
detections=data.get("detections", []),
|
||||
stats=data.get("stats", {}),
|
||||
config_snapshot=data.get("config_overrides", {}),
|
||||
config_overrides=data.get("config_overrides", {}),
|
||||
video_path=data.get("video_path", ""),
|
||||
profile_name=data.get("profile_name", ""),
|
||||
frames_manifest=checkpoint_data.get("frames_manifest", {}),
|
||||
frames_meta=checkpoint_data.get("frames_meta", []),
|
||||
filtered_frame_sequences=checkpoint_data.get("filtered_frame_sequences", []),
|
||||
stage_output_key=checkpoint_data.get("stage_output_key", ""),
|
||||
stats=checkpoint_data.get("stats", {}),
|
||||
config_snapshot=checkpoint_data.get("config_overrides", {}),
|
||||
config_overrides=checkpoint_data.get("config_overrides", {}),
|
||||
video_path=checkpoint_data.get("video_path", ""),
|
||||
profile_name=checkpoint_data.get("profile_name", ""),
|
||||
is_scenario=is_scenario,
|
||||
scenario_label=scenario_label,
|
||||
)
|
||||
|
||||
logger.info("Checkpoint saved to DB: %s/%s (id=%s)", job_id, stage, checkpoint.id)
|
||||
logger.info("Checkpoint saved: %s/%s (id=%s, scenario=%s)",
|
||||
job_id, stage, checkpoint.id, is_scenario)
|
||||
return str(checkpoint.id)
|
||||
|
||||
|
||||
def _save_to_s3(job_id: str, stage: str, data: dict) -> str:
    """Fallback: persist a checkpoint as a JSON object in S3.

    Used before modelgen generates the DB models. Non-serializable values
    are stringified (json default=str). Returns the S3 object key.
    """
    from core.storage.s3 import upload_file

    object_key = f"{CHECKPOINT_PREFIX}/{job_id}/stages/{stage}.json"
    payload = json.dumps(data, default=str)

    # upload_file() wants a path on disk, so stage the JSON in a temp file.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as handle:
        handle.write(payload)
        staged_path = handle.name

    try:
        upload_file(staged_path, BUCKET, object_key)
    finally:
        # Remove the temp file whether or not the upload succeeded.
        os.unlink(staged_path)

    logger.info("Checkpoint saved to S3: s3://%s/%s", BUCKET, object_key)
    return object_key
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Load
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -125,30 +75,12 @@ def _save_to_s3(job_id: str, stage: str, data: dict) -> str:
|
||||
def load_checkpoint(job_id: str, stage: str) -> dict:
|
||||
"""
|
||||
Load a stage checkpoint and reconstitute full DetectState.
|
||||
|
||||
Tries Postgres first, falls back to S3 JSON.
|
||||
"""
|
||||
if _has_db():
|
||||
data = _load_from_db(job_id, stage)
|
||||
else:
|
||||
data = _load_from_s3(job_id, stage)
|
||||
|
||||
raw_manifest = data.get("frames_manifest", {})
|
||||
manifest = {int(k): v for k, v in raw_manifest.items()}
|
||||
frame_metadata = data.get("frames_meta", [])
|
||||
frames = load_frames(manifest, frame_metadata)
|
||||
|
||||
state = deserialize_state(data, frames)
|
||||
|
||||
logger.info("Checkpoint loaded: %s/%s (%d frames)", job_id, stage, len(frames))
|
||||
return state
|
||||
|
||||
|
||||
def _load_from_db(job_id: str, stage: str) -> dict:
|
||||
"""Load checkpoint data from Postgres via core/db."""
|
||||
from core.db.detect import get_stage_checkpoint
|
||||
|
||||
checkpoint = get_stage_checkpoint(job_id, stage)
|
||||
if not checkpoint:
|
||||
raise ValueError(f"No checkpoint for {job_id}/{stage}")
|
||||
|
||||
data = {
|
||||
"job_id": str(checkpoint.job_id),
|
||||
@@ -158,28 +90,20 @@ def _load_from_db(job_id: str, stage: str) -> dict:
|
||||
"frames_manifest": checkpoint.frames_manifest,
|
||||
"frames_meta": checkpoint.frames_meta,
|
||||
"filtered_frame_sequences": checkpoint.filtered_frame_sequences,
|
||||
"boxes_by_frame": checkpoint.boxes_by_frame,
|
||||
"text_candidates": checkpoint.text_candidates,
|
||||
"unresolved_candidates": checkpoint.unresolved_candidates,
|
||||
"detections": checkpoint.detections,
|
||||
"stage_output_key": checkpoint.stage_output_key,
|
||||
"stats": checkpoint.stats,
|
||||
}
|
||||
return data
|
||||
|
||||
raw_manifest = data.get("frames_manifest", {})
|
||||
manifest = {int(k): v for k, v in raw_manifest.items()}
|
||||
frame_metadata = data.get("frames_meta", [])
|
||||
frames = load_frames(manifest, frame_metadata)
|
||||
|
||||
def _load_from_s3(job_id: str, stage: str) -> dict:
|
||||
"""Fallback: load checkpoint JSON from S3."""
|
||||
from core.storage.s3 import download_to_temp
|
||||
state = deserialize_state(data, frames)
|
||||
|
||||
key = f"{CHECKPOINT_PREFIX}/{job_id}/stages/{stage}.json"
|
||||
tmp_path = download_to_temp(BUCKET, key)
|
||||
try:
|
||||
with open(tmp_path) as f:
|
||||
data = json.load(f)
|
||||
finally:
|
||||
os.unlink(tmp_path)
|
||||
|
||||
return data
|
||||
logger.info("Checkpoint loaded: %s/%s (%d frames, scenario=%s)",
|
||||
job_id, stage, len(frames), checkpoint.is_scenario)
|
||||
return state
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -188,25 +112,5 @@ def _load_from_s3(job_id: str, stage: str) -> dict:
|
||||
|
||||
def list_checkpoints(job_id: str) -> list[str]:
    """Return the stage names that have a stored checkpoint for this job."""
    # Prefer Postgres when reachable; otherwise scan the S3 fallback layout.
    lister = _list_from_db if _has_db() else _list_from_s3
    return lister(job_id)
|
||||
|
||||
|
||||
def _list_from_db(job_id: str) -> list[str]:
    """List checkpoint stage names for a job from Postgres (thin core/db delegate)."""
    from core.db.detect import list_stage_checkpoints
    return list_stage_checkpoints(job_id)
|
||||
|
||||
|
||||
def _list_from_s3(job_id: str) -> list[str]:
    """Fallback: list checkpoint stage names by scanning S3 object keys.

    Stage names are derived from the object file stems under the job's
    "stages/" prefix (e.g. ".../stages/detect_edges.json" -> "detect_edges").
    Returns an empty list when no objects exist under the prefix.
    """
    from core.storage.s3 import list_objects

    prefix = f"{CHECKPOINT_PREFIX}/{job_id}/stages/"
    objects = list_objects(BUCKET, prefix)

    # Comprehension instead of the manual append loop (same order, same values).
    return [Path(obj["key"]).stem for obj in objects]
|
||||
|
||||
Reference in New Issue
Block a user