# mediaproc/core/api/detect_sources.py

"""
Source browser for detection pipeline.

Lists available media sources from blob storage (MinIO).
All file-based sources go through MinIO — no host filesystem access.
The pipeline downloads chunks to a temp path before processing.

Source types (current and future):
  - chunk_job: pre-chunked segments in MinIO (current)
  - upload:    user-uploaded file, lands in MinIO via upload endpoint (future)
  - device:    local camera/capture card via ffmpeg, no MinIO (future)
  - stream:    RTMP/HLS URL via ffmpeg, no MinIO (future)

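GET  /detect/sources                                        — list chunk jobs from blob store
GET  /detect/sources/{source_job_id}/chunks                 — list chunks for a specific job
GET  /detect/sources/{source_job_id}/chunks/{filename}/url  — presigned preview URL for a chunk
POST /detect/run                                            — launch pipeline on selected source
POST /detect/stop/{job_id}                                  — request cancellation of a running pipeline
POST /detect/clear/{job_id}                                 — delete a job's events from Redis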
"""

from __future__ import annotations

import logging
import os
import threading
import uuid

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Every route declared in this module is served under the /detect prefix.
router = APIRouter(prefix="/detect", tags=["detect"])

# In-process pipeline tracking
# These are plain module-level containers, so they only describe runs started
# by this Python process.
# _running_jobs: job_id -> worker thread. run_pipeline registers the thread
# just before starting it; the worker removes its own entry when it exits.
_running_jobs: dict[str, "threading.Thread"] = {}
# _cancelled_jobs: job ids for which stop_pipeline has requested cancellation.
# The cancel check installed by run_pipeline tests membership in this set, and
# the worker discards its id when it exits.
_cancelled_jobs: set[str] = set()


class ChunkInfo(BaseModel):
    """One chunk object of a source job, as returned by the chunk listing route."""

    # Copied from the listed blob object's ``filename`` attribute; the chunk
    # listing is sorted on this field.
    filename: str
    # Copied from the listed blob object's ``key`` attribute.
    key: str
    # Copied from the listed blob object's ``size_bytes`` attribute.
    size_bytes: int


class SourceInfo(BaseModel):
    """Summary of one media source available to the detection pipeline."""

    # Identifier of the chunk job: the path segment directly after "chunks/"
    # in the object keys.
    job_id: str
    # Kind of source; "chunk_job" is the only value produced in this module.
    source_type: str = "chunk_job"
    # Number of listed objects that belong to the job.
    chunk_count: int
    # Sum of ``size_bytes`` over those objects.
    total_bytes: int = 0


class RunRequest(BaseModel):
    """Request body for launching a detection pipeline run."""

    # Key of the chunk in blob storage; it is downloaded to a temp file before
    # the pipeline starts.
    video_path: str           # storage key
    # Forwarded unchanged into the pipeline's initial state.
    profile_name: str = "soccer_broadcast"
    # Forwarded unchanged into the pipeline's initial state.
    source_asset_id: str = ""
    # Passed to get_pipeline(checkpoint=...) when the pipeline is built.
    checkpoint: bool = True
    # When true the SKIP_VLM environment variable is set to "1" for the
    # process; when false the variable is removed.
    skip_vlm: bool = False
    # Same handling as skip_vlm, using the SKIP_CLOUD environment variable.
    skip_cloud: bool = False
    # Passed to emit.set_run_context as the run's log level.
    log_level: str = "INFO"  # INFO | DEBUG


class RunResponse(BaseModel):
    """Response returned once a pipeline run has been handed to its worker thread."""

    # run_pipeline reports "started".
    status: str
    # Short id generated for the run; use it with the stop and clear routes.
    job_id: str
    # Echo of the storage key supplied in the request (not the local temp path).
    video_path: str


# ---------------------------------------------------------------------------
# Source listing
# ---------------------------------------------------------------------------

def _list_sources() -> list[SourceInfo]:
    """Summarise the objects under ``chunks/`` as one entry per chunk job.

    Returns:
        One ``SourceInfo`` per job id, ordered by job id. If the listing call
        fails, the failure is logged as a warning and an empty list is
        returned instead of raising.
    """
    from core.storage.blob import get_store

    store = get_store("out")
    try:
        listing = store.list(prefix="chunks/")
    except Exception as e:
        logger.warning("Failed to list blob sources: %s", e)
        return []

    # job_id -> [number of objects, accumulated size in bytes]
    tallies: dict[str, list[int]] = {}
    for entry in listing:
        # Listed keys carry the store prefix (out/chunks/{job_id}/file.mp4);
        # dropping it leaves chunks/{job_id}/file.mp4.
        segments = entry.key.removeprefix(store.prefix).split("/")
        if len(segments) < 3 or segments[0] != "chunks":
            continue
        tally = tallies.setdefault(segments[1], [0, 0])
        tally[0] += 1
        tally[1] += entry.size_bytes

    return [
        SourceInfo(
            job_id=job_id,
            source_type="chunk_job",
            chunk_count=tallies[job_id][0],
            total_bytes=tallies[job_id][1],
        )
        for job_id in sorted(tallies)
    ]


@router.get("/sources", response_model=list[SourceInfo])
def list_sources():
    """Return the chunk jobs currently found in blob storage.

    Thin HTTP wrapper around ``_list_sources``; the result is serialised as a
    list of ``SourceInfo``.
    """
    sources = _list_sources()
    return sources


@router.get("/sources/{source_job_id}/chunks", response_model=list[ChunkInfo])
def list_chunks(source_job_id: str):
    """List the ``.mp4`` chunks stored for one source job.

    Args:
        source_job_id: Job id whose ``chunks/{source_job_id}/`` prefix is listed.

    Returns:
        ``ChunkInfo`` entries sorted by filename.

    Raises:
        HTTPException: 503 when the blob store listing fails, 404 when the
            listing returns no objects for the job.
    """
    from core.storage.blob import get_store

    store = get_store("out")
    try:
        objects = store.list(prefix=f"chunks/{source_job_id}/", extensions={".mp4"})
    except Exception as e:
        logger.warning("Failed to list chunks for %s: %s", source_job_id, e)
        # Chain the storage error so the original traceback stays attached to
        # the HTTP error.
        raise HTTPException(
            status_code=503, detail=f"Blob storage unavailable: {e}"
        ) from e

    if not objects:
        raise HTTPException(status_code=404, detail=f"Source not found: {source_job_id}")

    chunks = [
        ChunkInfo(filename=obj.filename, key=obj.key, size_bytes=obj.size_bytes)
        for obj in objects
    ]
    chunks.sort(key=lambda c: c.filename)
    return chunks


@router.get("/sources/{source_job_id}/chunks/{filename}/url")
def get_chunk_url(source_job_id: str, filename: str):
    """Return a presigned URL for previewing a chunk in the browser.

    Args:
        source_job_id: Job id the chunk belongs to.
        filename: File name of the chunk inside the job's ``chunks/`` folder.

    Returns:
        ``{"url": <presigned url>}``; the URL is requested with a one-hour
        expiry (``expires=3600``).

    Raises:
        HTTPException: 503 when the store cannot produce the URL.
    """
    from core.storage.blob import get_store

    store = get_store("out")
    key = f"chunks/{source_job_id}/{filename}"
    try:
        url = store.get_url(key, expires=3600)
    except Exception as e:
        # Log server-side like the other storage routes in this module, and
        # chain the cause so the original traceback is not lost.
        logger.warning("Failed to generate URL for %s: %s", key, e)
        raise HTTPException(
            status_code=503, detail=f"Could not generate URL: {e}"
        ) from e
    return {"url": url}


# ---------------------------------------------------------------------------
# Run pipeline
# ---------------------------------------------------------------------------

def _resolve_video_path(video_path: str) -> str:
    """Download a chunk from blob storage to a temp file.

    Args:
        video_path: Storage key of the chunk to fetch.

    Returns:
        Whatever ``store.download_to_temp`` returns for the key; callers use it
        as the local path of the downloaded file.

    Raises:
        HTTPException: 400 when the download fails for any reason.
    """
    from core.storage.blob import get_store

    store = get_store("out")
    try:
        return store.download_to_temp(video_path)
    except Exception as e:
        # Record the failure server-side (the HTTP detail only carries the
        # message) and keep the original exception attached as the cause.
        logger.warning("Failed to download chunk %s: %s", video_path, e)
        raise HTTPException(
            status_code=400, detail=f"Failed to download chunk: {e}"
        ) from e


@router.post("/run", response_model=RunResponse)
def run_pipeline(req: RunRequest):
    """Start a detection pipeline run for one stored chunk on a background thread.

    The chunk named by ``req.video_path`` is downloaded first, a fresh short job
    id is generated, and the pipeline is invoked on a daemon thread. The
    response is returned as soon as that thread has been started; progress and
    the final outcome are reported through ``emit``.

    Args:
        req: Run parameters (storage key, profile, skip flags, log level).

    Returns:
        ``RunResponse`` with status ``"started"``, the new job id and the
        storage key that was requested.

    Raises:
        HTTPException: 400 when the chunk cannot be downloaded.
    """
    import traceback

    from detect import emit
    from detect.graph import (
        PipelineCancelled,
        clear_cancel_check,
        get_pipeline,
        set_cancel_check,
    )
    from detect.state import DetectState

    # NOTE(review): the downloaded file is not removed anywhere in this
    # function — confirm that cleanup happens elsewhere.
    local_path = _resolve_video_path(req.video_path)
    # First eight characters of a random UUID4 (always hex digits).
    job_id = uuid.uuid4().hex[:8]

    # The skip switches are published as environment variables: set to "1"
    # when requested, removed otherwise.
    # NOTE(review): os.environ is shared by the whole process, so overlapping
    # runs with different flags overwrite each other — confirm whether
    # concurrent runs are expected.
    skip_switches = (
        ("SKIP_VLM", req.skip_vlm),
        ("SKIP_CLOUD", req.skip_cloud),
    )
    for env_name, enabled in skip_switches:
        if enabled:
            os.environ[env_name] = "1"
        else:
            os.environ.pop(env_name, None)

    # Drop any events left over from an earlier run that used the same job id.
    from core.events import _get_redis
    from detect.events import DETECT_EVENTS_PREFIX

    redis_client = _get_redis()
    redis_client.delete(f"{DETECT_EVENTS_PREFIX}:{job_id}")

    # NOTE(review): the run context is set here on the request thread but
    # cleared from the worker thread below — confirm it is not thread-local.
    emit.set_run_context(
        run_id=job_id,
        parent_job_id=job_id,
        run_type="initial",
        log_level=req.log_level,
    )

    pipeline = get_pipeline(checkpoint=req.checkpoint)

    start_state = DetectState(
        video_path=local_path,
        job_id=job_id,
        profile_name=req.profile_name,
        source_asset_id=req.source_asset_id,
    )

    # The pipeline asks this callback whether a stop was requested for the job.
    set_cancel_check(job_id, lambda: job_id in _cancelled_jobs)

    def _worker():
        # Runs the pipeline to completion and always reports a final status.
        try:
            emit.log(
                job_id, "Pipeline", "INFO",
                f"Starting pipeline: {req.video_path} (profile={req.profile_name})",
            )
            pipeline.invoke(start_state)
            emit.log(job_id, "Pipeline", "INFO", "Pipeline completed successfully")
            emit.job_complete(job_id, {"status": "completed"})
        except PipelineCancelled:
            emit.log(job_id, "Pipeline", "INFO", "Pipeline cancelled")
            emit.job_complete(job_id, {"status": "cancelled"})
        except Exception as exc:
            logger.exception("Pipeline run %s failed: %s", job_id, exc)
            trace_text = traceback.format_exc()
            message = str(exc)
            emit.log(job_id, "Pipeline", "ERROR", message)
            emit.log(job_id, "Pipeline", "DEBUG", trace_text)
            emit.job_complete(job_id, {"status": "failed", "error": message})
        finally:
            # Undo all per-job bookkeeping regardless of how the run ended.
            _running_jobs.pop(job_id, None)
            _cancelled_jobs.discard(job_id)
            clear_cancel_check(job_id)
            emit.clear_run_context()

    worker = threading.Thread(target=_worker, daemon=True, name=f"pipeline-{job_id}")
    # Register before starting so the worker's cleanup always finds the entry.
    _running_jobs[job_id] = worker
    worker.start()

    return RunResponse(status="started", job_id=job_id, video_path=req.video_path)


@router.post("/stop/{job_id}")
def stop_pipeline(job_id: str):
    """Request cancellation of a pipeline that is tracked as running.

    The job id is only flagged as cancelled here; the worker thread is not
    interrupted by this call.

    Args:
        job_id: Id returned by the run endpoint.

    Returns:
        ``{"status": "stopping", "job_id": job_id}``.

    Raises:
        HTTPException: 404 when no running pipeline is registered under the id.
    """
    from detect import emit

    is_tracked = job_id in _running_jobs
    if not is_tracked:
        raise HTTPException(status_code=404, detail=f"No running pipeline: {job_id}")

    # The cancel check registered for the job reads this set.
    _cancelled_jobs.add(job_id)
    emit.log(
        job_id,
        "Pipeline",
        "INFO",
        "Stop requested — cancelling after current stage",
    )
    return dict(status="stopping", job_id=job_id)


@router.post("/clear/{job_id}")
def clear_pipeline(job_id: str):
    """Delete the Redis entry that holds a job's detection events.

    Args:
        job_id: Id of the job whose events should be removed.

    Returns:
        ``{"status": "cleared", "job_id": job_id}``; the same response is
        returned whether or not the key existed.
    """
    from core.events import _get_redis
    from detect.events import DETECT_EVENTS_PREFIX

    redis_client = _get_redis()
    events_key = f"{DETECT_EVENTS_PREFIX}:{job_id}"
    redis_client.delete(events_key)
    return dict(status="cleared", job_id=job_id)