Refactor storage: MinIO for k8s
@@ -47,6 +47,12 @@ class BrandSource(models.TextChoices):
    CLOUD = "cloud_llm", "Cloud"
    MANUAL = "manual", "Manual"

class SourceType(models.TextChoices):
    CHUNK_JOB = "chunk_job", "Chunk Job"
    UPLOAD = "upload", "Upload"
    DEVICE = "device", "Device"
    STREAM = "stream", "Stream"

class MediaAsset(models.Model):
    """A video/audio file registered in the system."""
@@ -268,3 +274,32 @@ class SourceBrandSighting(models.Model):
    def __str__(self):
        return str(self.id)


class SourceJob(models.Model):
    """A group of chunks that belong together (same source video/session)."""

    job_id = models.CharField(max_length=255)
    source_type = models.CharField(max_length=255)
    chunk_count = models.IntegerField()
    total_bytes = models.IntegerField(default=0)

    class Meta:
        pass

    def __str__(self):
        return str(self.id)


class ChunkInfo(models.Model):
    """A single chunk (video segment) stored in blob storage."""

    filename = models.CharField(max_length=500)
    key = models.CharField(max_length=255)
    size_bytes = models.IntegerField()

    class Meta:
        pass

    def __str__(self):
        return self.filename
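Once these generated models are migrated, selection stays plain Django ORM. A minimal sketch (the `core.models` import path is an assumption; the field names come from the hunk above):

```python
# Hypothetical usage, assuming migrations for SourceJob have run and the
# generated models are importable from core.models (path is an assumption).
from core.models import SourceJob

# All chunk-job sources, largest first
jobs = SourceJob.objects.filter(source_type="chunk_job").order_by("-total_bytes")
for job in jobs:
    print(job.job_id, job.chunk_count, job.total_bytes)
```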
core/api/detect_sources.py (new file)
@@ -0,0 +1,259 @@
"""
Source browser for detection pipeline.

Lists available media sources from blob storage (MinIO).
All file-based sources go through MinIO — no host filesystem access.
The pipeline downloads chunks to a temp path before processing.

Source types (current and future):
  - chunk_job: pre-chunked segments in MinIO (current)
  - upload: user-uploaded file, lands in MinIO via upload endpoint (future)
  - device: local camera/capture card via ffmpeg, no MinIO (future)
  - stream: RTMP/HLS URL via ffmpeg, no MinIO (future)

GET  /detect/sources                   — list chunk jobs from blob store
GET  /detect/sources/{job_id}/chunks   — list chunks for a specific job
POST /detect/run                       — launch pipeline on selected source
"""

from __future__ import annotations

import logging
import os
import threading
import uuid

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/detect", tags=["detect"])

# In-process pipeline tracking
_running_jobs: dict[str, "threading.Thread"] = {}
_cancelled_jobs: set[str] = set()


class ChunkInfo(BaseModel):
    filename: str
    key: str
    size_bytes: int


class SourceInfo(BaseModel):
    job_id: str
    source_type: str = "chunk_job"
    chunk_count: int
    total_bytes: int = 0


class RunRequest(BaseModel):
    video_path: str  # storage key
    profile_name: str = "soccer_broadcast"
    source_asset_id: str = ""
    checkpoint: bool = True
    skip_vlm: bool = False
    skip_cloud: bool = False
    log_level: str = "INFO"  # INFO | DEBUG


class RunResponse(BaseModel):
    status: str
    job_id: str
    video_path: str


# ---------------------------------------------------------------------------
# Source listing
# ---------------------------------------------------------------------------

def _list_sources() -> list[SourceInfo]:
    """List chunk jobs from blob storage."""
    from core.storage.blob import get_store

    store = get_store("out")
    try:
        objects = store.list(prefix="chunks/")
    except Exception as e:
        logger.warning("Failed to list blob sources: %s", e)
        return []

    jobs: dict[str, int] = {}
    job_bytes: dict[str, int] = {}
    for obj in objects:
        # Keys include store prefix: out/chunks/{job_id}/file.mp4
        # Strip prefix to get: chunks/{job_id}/file.mp4
        rel_key = obj.key.removeprefix(store.prefix)
        parts = rel_key.split("/")
        if len(parts) >= 3 and parts[0] == "chunks":
            job_id = parts[1]
            jobs[job_id] = jobs.get(job_id, 0) + 1
            job_bytes[job_id] = job_bytes.get(job_id, 0) + obj.size_bytes

    sources = []
    for job_id, count in sorted(jobs.items()):
        source = SourceInfo(
            job_id=job_id,
            source_type="chunk_job",
            chunk_count=count,
            total_bytes=job_bytes.get(job_id, 0),
        )
        sources.append(source)
    return sources


@router.get("/sources", response_model=list[SourceInfo])
def list_sources():
    """List available chunk jobs from blob storage."""
    return _list_sources()


@router.get("/sources/{source_job_id}/chunks", response_model=list[ChunkInfo])
def list_chunks(source_job_id: str):
    """List chunks for a specific source job."""
    from core.storage.blob import get_store

    store = get_store("out")
    try:
        objects = store.list(prefix=f"chunks/{source_job_id}/", extensions={".mp4"})
    except Exception as e:
        logger.warning("Failed to list chunks for %s: %s", source_job_id, e)
        raise HTTPException(status_code=503, detail=f"Blob storage unavailable: {e}")

    if not objects:
        raise HTTPException(status_code=404, detail=f"Source not found: {source_job_id}")

    chunks = []
    for obj in objects:
        info = ChunkInfo(filename=obj.filename, key=obj.key, size_bytes=obj.size_bytes)
        chunks.append(info)
    return sorted(chunks, key=lambda c: c.filename)


@router.get("/sources/{source_job_id}/chunks/{filename}/url")
def get_chunk_url(source_job_id: str, filename: str):
    """Return a presigned URL for previewing a chunk in the browser."""
    from core.storage.blob import get_store

    store = get_store("out")
    # Object keys carry the store prefix (see _list_sources), so include it here;
    # a bare "chunks/..." key would presign a nonexistent object.
    key = f"{store.prefix}chunks/{source_job_id}/{filename}"
    try:
        url = store.get_url(key, expires=3600)
    except Exception as e:
        raise HTTPException(status_code=503, detail=f"Could not generate URL: {e}")
    return {"url": url}


# ---------------------------------------------------------------------------
# Run pipeline
# ---------------------------------------------------------------------------

def _resolve_video_path(video_path: str) -> str:
    """Download a chunk from blob storage to a temp file."""
    from core.storage.blob import get_store

    store = get_store("out")
    try:
        return store.download_to_temp(video_path)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Failed to download chunk: {e}")


@router.post("/run", response_model=RunResponse)
def run_pipeline(req: RunRequest):
    """Launch a detection pipeline run on a source chunk."""
    from detect import emit
    from detect.graph import get_pipeline
    from detect.state import DetectState

    local_path = _resolve_video_path(req.video_path)
    job_id = str(uuid.uuid4())[:8]

    if req.skip_vlm:
        os.environ["SKIP_VLM"] = "1"
    elif "SKIP_VLM" in os.environ:
        del os.environ["SKIP_VLM"]

    if req.skip_cloud:
        os.environ["SKIP_CLOUD"] = "1"
    elif "SKIP_CLOUD" in os.environ:
        del os.environ["SKIP_CLOUD"]

    # Clear any stale events from a previous run with same job_id
    from core.events import _get_redis
    from detect.events import DETECT_EVENTS_PREFIX
    r = _get_redis()
    r.delete(f"{DETECT_EVENTS_PREFIX}:{job_id}")

    emit.set_run_context(
        run_id=job_id, parent_job_id=job_id, run_type="initial",
        log_level=req.log_level,
    )

    pipeline = get_pipeline(checkpoint=req.checkpoint)

    initial_state = DetectState(
        video_path=local_path,
        job_id=job_id,
        profile_name=req.profile_name,
        source_asset_id=req.source_asset_id,
    )

    import traceback

    from detect.graph import PipelineCancelled, set_cancel_check, clear_cancel_check

    set_cancel_check(job_id, lambda: job_id in _cancelled_jobs)

    def _run():
        try:
            emit.log(job_id, "Pipeline", "INFO",
                     f"Starting pipeline: {req.video_path} (profile={req.profile_name})")
            pipeline.invoke(initial_state)
            emit.log(job_id, "Pipeline", "INFO", "Pipeline completed successfully")
            emit.job_complete(job_id, {"status": "completed"})
        except PipelineCancelled:
            emit.log(job_id, "Pipeline", "INFO", "Pipeline cancelled")
            emit.job_complete(job_id, {"status": "cancelled"})
        except Exception as e:
            logger.exception("Pipeline run %s failed: %s", job_id, e)
            tb = traceback.format_exc()
            emit.log(job_id, "Pipeline", "ERROR", str(e))
            emit.log(job_id, "Pipeline", "DEBUG", tb)
            emit.job_complete(job_id, {"status": "failed", "error": str(e)})
        finally:
            _running_jobs.pop(job_id, None)
            _cancelled_jobs.discard(job_id)
            clear_cancel_check(job_id)
            emit.clear_run_context()

    thread = threading.Thread(target=_run, daemon=True, name=f"pipeline-{job_id}")
    _running_jobs[job_id] = thread
    thread.start()

    return RunResponse(status="started", job_id=job_id, video_path=req.video_path)


@router.post("/stop/{job_id}")
def stop_pipeline(job_id: str):
    """Stop a running pipeline. Signals cancellation; the thread checks on next stage."""
    from detect import emit

    if job_id not in _running_jobs:
        raise HTTPException(status_code=404, detail=f"No running pipeline: {job_id}")

    _cancelled_jobs.add(job_id)
    emit.log(job_id, "Pipeline", "INFO", "Stop requested — cancelling after current stage")
    return {"status": "stopping", "job_id": job_id}


@router.post("/clear/{job_id}")
def clear_pipeline(job_id: str):
    """Clear events for a job from Redis."""
    from core.events import _get_redis
    from detect.events import DETECT_EVENTS_PREFIX

    r = _get_redis()
    r.delete(f"{DETECT_EVENTS_PREFIX}:{job_id}")
    return {"status": "cleared", "job_id": job_id}
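Taken together, the three endpoints form a small launch workflow: browse jobs, pick a chunk, post its storage key to `/run`. A minimal client sketch using `requests`; the `http://localhost/api` base is an assumption taken from the UI's fetch paths and the kind port mapping below:

```python
import requests

BASE = "http://localhost/api/detect"  # assumed gateway mount; adjust per deployment

# 1. Pick a source job, then list its chunks
sources = requests.get(f"{BASE}/sources").json()
job_id = sources[0]["job_id"]
chunks = requests.get(f"{BASE}/sources/{job_id}/chunks").json()

# 2. Launch the pipeline on the first chunk's storage key
run = requests.post(f"{BASE}/run", json={
    "video_path": chunks[0]["key"],
    "profile_name": "soccer_broadcast",
    "skip_cloud": True,  # mirrors the UI default
}).json()
print("started", run["job_id"])

# 3. Optionally stop it; cancellation lands after the current stage
requests.post(f"{BASE}/stop/{run['job_id']}")
```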
@@ -27,6 +27,7 @@ from core.api.chunker_sse import router as chunker_router
from core.api.detect_sse import router as detect_router
from core.api.detect_replay import router as detect_replay_router
from core.api.detect_config import router as detect_config_router
from core.api.detect_sources import router as detect_sources_router
from core.api.graphql import schema as graphql_schema

CALLBACK_API_KEY = os.environ.get("CALLBACK_API_KEY", "")

@@ -64,6 +65,9 @@ app.include_router(detect_replay_router)
# Detection config
app.include_router(detect_config_router)

# Detection sources + run launcher
app.include_router(detect_sources_router)


@app.get("/health")
def health():
@@ -20,8 +20,8 @@ logger = logging.getLogger()
logger.setLevel(logging.INFO)

# S3 config
-S3_BUCKET_IN = os.environ.get("S3_BUCKET_IN", "mpr-media-in")
-S3_BUCKET_OUT = os.environ.get("S3_BUCKET_OUT", "mpr-media-out")
+S3_BUCKET_IN = os.environ.get("S3_BUCKET_IN", "in")
+S3_BUCKET_OUT = os.environ.get("S3_BUCKET_OUT", "out")
AWS_REGION = os.environ.get("AWS_REGION", "us-east-1")

s3 = boto3.client("s3", region_name=AWS_REGION)
@@ -35,10 +35,12 @@ from .presets import BUILTIN_PRESETS, TranscodePreset
from .detect import DETECT_VIEWS  # noqa: F401 — discovered by modelgen generic loader
from .ui_state import UI_STATE_VIEWS  # noqa: F401 — UI store state types
from .views import ChunkEvent, ChunkOutputFile, PipelineStats, WorkerEvent
from .sources import ChunkInfo, SourceJob, SourceType

# Core domain models - generates Django, Pydantic, TypeScript
DATACLASSES = [MediaAsset, TranscodePreset, TranscodeJob, ChunkJob,
-              DetectJob, StageCheckpoint, KnownBrand, SourceBrandSighting]
+              DetectJob, StageCheckpoint, KnownBrand, SourceBrandSighting,
+              SourceJob, ChunkInfo]

# API request/response models - generates TypeScript only (no Django)
# WorkerStatus from grpc.py is reused here
@@ -52,7 +54,7 @@ API_MODELS = [
]

# Status enums - included in generated code
-ENUMS = [AssetStatus, JobStatus, ChunkJobStatus, DetectJobStatus, RunType, BrandSource]
+ENUMS = [AssetStatus, JobStatus, ChunkJobStatus, DetectJobStatus, RunType, BrandSource, SourceType]

# View/event models - generates TypeScript for UI consumption
VIEWS = [ChunkEvent, WorkerEvent, PipelineStats, ChunkOutputFile]
@@ -105,6 +107,10 @@ __all__ = [
    "WorkerEvent",
    "PipelineStats",
    "ChunkOutputFile",
    # Sources
    "SourceType",
    "SourceJob",
    "ChunkInfo",
    # For generator
    "DATACLASSES",
    "API_MODELS",
core/schema/models/sources.py (new file)
@@ -0,0 +1,39 @@
"""
Media source models.

Describes what types of sources the detection pipeline can process.
Only chunk_job (blobs in MinIO) is implemented now — the rest are
extension points with defined shapes.
"""

from dataclasses import dataclass, field
from enum import Enum


class SourceType(str, Enum):
    CHUNK_JOB = "chunk_job"  # pre-chunked video segments in blob storage
    UPLOAD = "upload"        # future: user-uploaded file → MinIO → pipeline
    DEVICE = "device"        # future: local camera/capture card via ffmpeg (no MinIO)
    STREAM = "stream"        # future: RTMP/HLS URL via ffmpeg (no MinIO)


@dataclass
class ChunkInfo:
    """A single chunk (video segment) stored in blob storage."""
    filename: str
    key: str  # storage key (MinIO object key)
    size_bytes: int


@dataclass
class SourceJob:
    """
    A group of chunks that belong together (same source video/session).

    Listed by the source selector so the user can pick a job,
    then drill into its chunks.
    """
    job_id: str
    source_type: str  # SourceType value
    chunk_count: int
    total_bytes: int = 0
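As a shape check, a minimal sketch of building these dataclasses by hand (the filenames, keys, and sizes are made up for illustration):

```python
from core.schema.models.sources import ChunkInfo, SourceJob, SourceType

# Two illustrative chunks from one job; keys carry the store prefix.
chunks = [
    ChunkInfo(filename="chunk_000.mp4", key="out/chunks/job-a/chunk_000.mp4", size_bytes=1_048_576),
    ChunkInfo(filename="chunk_001.mp4", key="out/chunks/job-a/chunk_001.mp4", size_bytes=2_097_152),
]

# Aggregate into the job row the source selector lists.
job = SourceJob(
    job_id="job-a",
    source_type=SourceType.CHUNK_JOB.value,
    chunk_count=len(chunks),
    total_bytes=sum(c.size_bytes for c in chunks),
)
assert job.total_bytes == 3_145_728
```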
@@ -1,6 +1,5 @@
from .blob import BUCKET, PREFIX_CHECKPOINTS, PREFIX_IN, PREFIX_OUT, BlobObject, BlobStore, get_store
from .s3 import (
-    BUCKET_IN,
-    BUCKET_OUT,
    download_file,
    download_to_temp,
    get_presigned_url,
@@ -8,3 +7,8 @@ from .s3 import (
    list_objects,
    upload_file,
)

# Backward compat — old code uses BUCKET_IN / BUCKET_OUT as full bucket names.
# Now they're one bucket; these exist so existing handlers don't break.
BUCKET_IN = BUCKET
BUCKET_OUT = BUCKET
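A sketch of what the shim preserves, assuming an old handler that addressed buckets by name (the key shown is illustrative): both aliases now resolve to the single bucket, and the in/out split moves into the object key.

```python
# Old-style call site, unchanged by this commit.
from core.storage import BUCKET_OUT, upload_file

# BUCKET_OUT now aliases the single bucket ("mpr" by default); the "out/"
# part lives in the object key rather than the bucket name.
upload_file("/tmp/chunk_000.mp4", BUCKET_OUT, "out/chunks/job-a/chunk_000.mp4")
```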
core/storage/blob.py (new file)
@@ -0,0 +1,112 @@
"""
Cloud-agnostic blob storage interface.

All file-based sources (chunks, uploads, checkpoints) go through MinIO.
Local dev runs MinIO in docker-compose — same code path as production.
Production changes S3_ENDPOINT_URL; nothing else changes.

Single bucket, multiple prefixes:
    in/           — source media
    out/          — transcoded chunks
    checkpoints/  — detection intermediate blobs (frames, crops)

Each prefix is independently configurable via env vars so they can
be split into separate buckets later if needed.

Nothing outside core/storage/ should import boto3 directly.
"""

from __future__ import annotations

import os
from dataclasses import dataclass
from typing import Optional


# Single bucket, prefix-based layout
BUCKET = os.environ.get("S3_BUCKET", "mpr")
PREFIX_IN = os.environ.get("S3_PREFIX_IN", "in/")
PREFIX_OUT = os.environ.get("S3_PREFIX_OUT", "out/")
PREFIX_CHECKPOINTS = os.environ.get("S3_PREFIX_CHECKPOINTS", "checkpoints/")


@dataclass
class BlobObject:
    key: str
    filename: str
    size_bytes: int


class BlobStore:
    """
    Thin wrapper over the S3-compatible storage backend (MinIO / AWS S3).

    All configuration (endpoint URL, credentials, region) is read from
    environment variables by the underlying s3 module.
    """

    def __init__(self, bucket: str, prefix: str = ""):
        self.bucket = bucket
        self.prefix = prefix

    def _full_prefix(self, prefix: str) -> str:
        """Combine store prefix with caller prefix."""
        return self.prefix + prefix

    def list(
        self,
        prefix: str = "",
        extensions: Optional[set[str]] = None,
    ) -> list[BlobObject]:
        """List objects in the bucket, optionally filtered by extension."""
        from core.storage.s3 import list_objects

        full = self._full_prefix(prefix)
        raw = list_objects(self.bucket, prefix=full, extensions=extensions)
        objects = []
        for obj in raw:
            blob = BlobObject(
                key=obj["key"],
                filename=obj["filename"],
                size_bytes=obj["size"],
            )
            objects.append(blob)
        return objects

    def download_to_temp(self, key: str) -> str:
        """Download a blob to a temp file. Caller is responsible for cleanup."""
        from core.storage.s3 import download_to_temp

        return download_to_temp(self.bucket, key)

    def upload(self, local_path: str, key: str) -> None:
        """Upload a local file to the bucket."""
        from core.storage.s3 import upload_file

        upload_file(local_path, self.bucket, key)

    def get_url(self, key: str, expires: int = 3600) -> str:
        """Return a presigned URL for the given key."""
        from core.storage.s3 import get_presigned_url

        return get_presigned_url(self.bucket, key, expires=expires)


def get_store(purpose: str = "out") -> BlobStore:
    """
    Return a BlobStore for the given purpose.

    Purposes map to prefixes:
        "in"          → source media (S3_PREFIX_IN)
        "out"         → transcoded output (S3_PREFIX_OUT)
        "checkpoints" → detection blobs (S3_PREFIX_CHECKPOINTS)

    All share the same bucket (S3_BUCKET), each scoped to its prefix.
    """
    prefix_map = {
        "in": PREFIX_IN,
        "out": PREFIX_OUT,
        "checkpoints": PREFIX_CHECKPOINTS,
    }
    prefix = prefix_map.get(purpose, "")
    return BlobStore(BUCKET, prefix=prefix)
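Putting it together, a minimal sketch of the store in use (object keys are illustrative; note that keys returned by `list()` already include the store prefix, matching how `detect_sources.py` strips it above):

```python
from core.storage.blob import get_store

out = get_store("out")  # BlobStore(bucket="mpr", prefix="out/") with the defaults above

# Keys come back with the store prefix included, e.g. "out/chunks/job-a/chunk_000.mp4"
chunks = out.list(prefix="chunks/", extensions={".mp4"})
for obj in chunks:
    print(obj.filename, obj.size_bytes)

if chunks:
    # Keys from list() are passed back as-is for download / presigning.
    local_path = out.download_to_temp(chunks[0].key)
    preview_url = out.get_url(chunks[0].key, expires=3600)
```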
@@ -13,8 +13,8 @@ from typing import Optional
import boto3
from botocore.config import Config

-BUCKET_IN = os.environ.get("S3_BUCKET_IN", "mpr-media-in")
-BUCKET_OUT = os.environ.get("S3_BUCKET_OUT", "mpr-media-out")
+BUCKET_IN = os.environ.get("S3_BUCKET_IN", "in")
+BUCKET_OUT = os.environ.get("S3_BUCKET_OUT", "out")


def get_s3_client():
@@ -1,5 +1,7 @@
FROM python:3.11-slim

RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg && rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir uv

WORKDIR /app
@@ -14,6 +14,7 @@ docker_build(
    'mpr-fastapi',
    context='..',
    dockerfile='Dockerfile',
    ignore=['.git', 'def', 'docs', 'media', 'ui', 'gpu', 'modelgen', '.claude', 'tests'],
    live_update=[
        sync('..', '/app'),
    ],
@@ -32,10 +32,10 @@ spec:
          periodSeconds: 10
        resources:
          requests:
            memory: 128Mi
            cpu: 100m
-         limits:
-           memory: 512Mi
-           cpu: 500m
+         limits:
+           memory: 2Gi
---
apiVersion: v1
kind: Service
@@ -5,8 +5,10 @@ metadata:
  namespace: mpr
data:
  S3_ENDPOINT_URL: http://minio:9000
-  S3_BUCKET_IN: mpr-media-in
-  S3_BUCKET_OUT: mpr-media-out
+  S3_BUCKET: mpr
+  S3_PREFIX_IN: in/
+  S3_PREFIX_OUT: out/
+  S3_PREFIX_CHECKPOINTS: checkpoints/
  AWS_ACCESS_KEY_ID: minioadmin
  AWS_SECRET_ACCESS_KEY: minioadmin
  AWS_REGION: us-east-1
@@ -54,9 +56,7 @@ spec:
            - -c
            - |
              sleep 3
-             for bucket in mpr-media-in mpr-media-out; do
-               mkdir -p /data/$bucket
-             done
+             mkdir -p /data/mpr/in /data/mpr/out /data/mpr/checkpoints
          volumeMounts:
            - name: data
              mountPath: /data
ctrl/k8s/kind-config.yaml.tpl (new file)
@@ -0,0 +1,12 @@
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
name: mpr
nodes:
  - role: control-plane
    extraPortMappings:
      - containerPort: 30080
        hostPort: 80
        protocol: TCP
    extraMounts:
      - hostPath: ${MEDIA_HOST_PATH}
        containerPath: /mnt/media
@@ -3,6 +3,7 @@ kind: Kustomization

resources:
  - ../../base
  - minio-pvc.yaml

patches:
  # Gateway as NodePort for local access
@@ -28,3 +29,40 @@ patches:
      - op: add
        path: /spec/ports/0/nodePort
        value: 30379

  # MinIO with persistent storage + host media mount for seeding.
  # PV survives pod restarts. Host mount is read-only for mc mirror seeding.
  # Requires kind cluster created with MEDIA_HOST_PATH extraMount (see kind-create.sh).
  - target:
      kind: Deployment
      name: minio
    patch: |
      - op: replace
        path: /spec/template/spec/volumes/0
        value:
          name: data
          persistentVolumeClaim:
            claimName: minio-data
      - op: add
        path: /spec/template/spec/containers/0/volumeMounts/-
        value:
          name: host-media
          mountPath: /host-media
          readOnly: true
      - op: add
        path: /spec/template/spec/volumes/-
        value:
          name: host-media
          hostPath:
            path: /mnt/media
            type: DirectoryOrCreate
      - op: replace
        path: /spec/template/spec/containers/0/lifecycle/postStart/exec/command
        value:
          - /bin/sh
          - -c
          - |
            until curl -sf http://localhost:9000/minio/health/live; do sleep 1; done
            /usr/bin/mc alias set local http://localhost:9000 minioadmin minioadmin --quiet
            /usr/bin/mc mb --ignore-existing local/mpr
            /usr/bin/mc cp --recursive /host-media/mpr/out/ local/mpr/out/ --quiet || true
ctrl/k8s/overlays/dev/minio-pvc.yaml (new file)
@@ -0,0 +1,11 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: minio-data
  namespace: mpr
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 5Gi
ctrl/kind-create.sh (new executable file)
@@ -0,0 +1,12 @@
#!/bin/bash
# Create the kind cluster with host media mount.
# Usage: MEDIA_HOST_PATH=/home/you/mpr/media ./kind-create.sh
set -euo pipefail

: "${MEDIA_HOST_PATH:?Set MEDIA_HOST_PATH to your local media directory (e.g. /home/you/mpr/media)}"

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
CONFIG_TPL="$SCRIPT_DIR/k8s/kind-config.yaml.tpl"

envsubst < "$CONFIG_TPL" | kind create cluster --config -
echo "Cluster 'mpr' created with media mount: $MEDIA_HOST_PATH → /mnt/media"
@@ -13,7 +13,7 @@ from detect.models import Frame

logger = logging.getLogger(__name__)

-BUCKET = os.environ.get("S3_BUCKET_OUT", "mpr-media-out")
+BUCKET = os.environ.get("S3_BUCKET_OUT", "out")
CHECKPOINT_PREFIX = "checkpoints"
@@ -32,6 +32,14 @@ langfuse>=2.0.0
# Cloud LLM providers (only needed for cloud escalation stage)
anthropic>=0.40.0

# Detection pipeline orchestration
numpy>=1.24.0
Pillow>=10.0.0
imagehash>=4.3.0
ffmpeg-python>=0.2.0
langgraph>=0.0.30
rapidfuzz>=3.0.0

# Testing
pytest>=7.4.0
pytest-django>=4.7.0
@@ -65,7 +65,7 @@ MESSAGES = {

def main():
    parser = argparse.ArgumentParser()
-   parser.add_argument("--job", default="manual-test")
+   parser.add_argument("--job", default=f"logs-{int(__import__('time').time()) % 100000}")
    parser.add_argument("--port", type=int, default=6382)
    parser.add_argument("--count", type=int, default=50)
    parser.add_argument("--delay", type=float, default=0.2)
@@ -10,6 +10,7 @@ export type ChunkJobStatus = "pending" | "chunking" | "processing" | "collecting
export type DetectJobStatus = "pending" | "running" | "paused" | "completed" | "failed" | "cancelled";
export type RunType = "initial" | "replay" | "retry";
export type BrandSource = "ocr" | "local_vlm" | "cloud_llm" | "manual";
export type SourceType = "chunk_job" | "upload" | "device" | "stream";

export interface MediaAsset {
  id: string;
@@ -169,6 +170,19 @@ export interface SourceBrandSighting {
  created_at: string | null;
}

export interface SourceJob {
  job_id: string;
  source_type: string;
  chunk_count: number;
  total_bytes: number;
}

export interface ChunkInfo {
  filename: string;
  key: string;
  size_bytes: number;
}

export interface CreateJobRequest {
  source_asset_id: string;
  preset_id: string | null;
ui/detection-app/src/panels/SourceSelector.vue (new file)
@@ -0,0 +1,386 @@
<script setup lang="ts">
import { ref, onMounted } from 'vue'
import { Panel } from 'mpr-ui-framework'
import { usePipelineStore } from '../stores/pipeline'

const pipeline = usePipelineStore()

interface ChunkInfo {
  filename: string
  key: string
  size_bytes: number
}

interface SourceInfo {
  job_id: string
  source_type: string
  chunk_count: number
  total_bytes: number
}

const SOURCE_TYPE_LABELS: Record<string, string> = {
  chunk_job: 'CHUNKS',
  upload: 'UPLOAD',
  device: 'DEVICE',
  stream: 'STREAM',
}

const sources = ref<SourceInfo[]>([])
const chunks = ref<ChunkInfo[]>([])
const selectedSource = ref<string | null>(null)
const selectedChunk = ref<string | null>(null)
const loading = ref(false)
const running = ref(false)
const skipVlm = ref(false)
const skipCloud = ref(true)
const checkpoint = ref(true)
const logLevel = ref('INFO')
const error = ref<string | null>(null)

async function loadSources() {
  loading.value = true
  error.value = null
  try {
    const resp = await fetch('/api/detect/sources')
    if (!resp.ok) throw new Error(`${resp.status} ${resp.statusText}`)
    sources.value = await resp.json()
  } catch (e: any) {
    error.value = `Failed to load sources: ${e.message}`
  } finally {
    loading.value = false
  }
}

async function loadChunks(jobId: string) {
  selectedSource.value = jobId
  selectedChunk.value = null
  chunks.value = []
  try {
    const resp = await fetch(`/api/detect/sources/${jobId}/chunks`)
    if (!resp.ok) throw new Error(`${resp.status} ${resp.statusText}`)
    chunks.value = await resp.json()
  } catch (e: any) {
    error.value = `Failed to load chunks: ${e.message}`
  }
}

function selectChunk(chunk: ChunkInfo) {
  selectedChunk.value = chunk.key
}

async function openPreview(chunk: ChunkInfo) {
  if (!selectedSource.value) return
  try {
    const resp = await fetch(
      `/api/detect/sources/${selectedSource.value}/chunks/${encodeURIComponent(chunk.filename)}/url`
    )
    if (!resp.ok) throw new Error(`${resp.status}`)
    const data = await resp.json()
    window.open(data.url, '_blank')
  } catch (e: any) {
    error.value = `Could not get preview URL: ${e.message}`
  }
}

const emit = defineEmits<{
  (e: 'job-started', jobId: string): void
}>()

async function runPipeline() {
  if (!selectedChunk.value) return
  running.value = true
  error.value = null

  try {
    const resp = await fetch('/api/detect/run', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        video_path: selectedChunk.value,
        checkpoint: checkpoint.value,
        skip_vlm: skipVlm.value,
        skip_cloud: skipCloud.value,
        log_level: logLevel.value,
      }),
    })
    if (!resp.ok) {
      const detail = await resp.text()
      throw new Error(`${resp.status}: ${detail}`)
    }

    const data = await resp.json()
    emit('job-started', data.job_id)
  } catch (e: any) {
    error.value = `Failed to start pipeline: ${e.message}`
    running.value = false
  }
}

function formatSize(bytes: number): string {
  if (bytes < 1024) return `${bytes}B`
  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)}KB`
  if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)}MB`
  return `${(bytes / (1024 * 1024 * 1024)).toFixed(2)}GB`
}

function sourceTypeLabel(sourceType: string): string {
  return SOURCE_TYPE_LABELS[sourceType] ?? sourceType.toUpperCase()
}

onMounted(loadSources)
</script>

<template>
  <Panel title="Select Source">
    <div class="source-selector">

      <div v-if="error" class="source-error">{{ error }}</div>

      <!-- Source list -->
      <div class="source-section">
        <h3>Chunk Jobs</h3>
        <div class="source-list" v-if="!loading">
          <div
            v-for="src in sources"
            :key="src.job_id"
            :class="['source-item', { selected: selectedSource === src.job_id }]"
            @click="loadChunks(src.job_id)"
          >
            <span class="source-id">{{ src.job_id.slice(0, 12) }}</span>
            <span class="source-meta">
              <span class="source-type-badge">{{ sourceTypeLabel(src.source_type) }}</span>
              <span class="source-count">{{ src.chunk_count }} chunks</span>
              <span class="source-size">{{ formatSize(src.total_bytes) }}</span>
            </span>
          </div>
          <div v-if="sources.length === 0" class="source-empty">No sources found</div>
        </div>
        <div v-else class="source-empty">Loading...</div>
      </div>

      <!-- Chunk list -->
      <div class="source-section" v-if="chunks.length > 0">
        <h3>Chunks</h3>
        <div class="chunk-list">
          <div
            v-for="chunk in chunks"
            :key="chunk.key"
            :class="['chunk-item', { selected: selectedChunk === chunk.key }]"
            @click="selectChunk(chunk)"
          >
            <span class="chunk-name">{{ chunk.filename }}</span>
            <span class="chunk-meta">
              <span class="chunk-size">{{ formatSize(chunk.size_bytes) }}</span>
              <button
                class="preview-btn"
                @click.stop="openPreview(chunk)"
                title="Open preview"
              >▶</button>
            </span>
          </div>
        </div>
      </div>

      <!-- Run options -->
      <div class="run-options" v-if="selectedChunk">
        <h3>Run Options</h3>
        <label><input type="checkbox" v-model="checkpoint"> Checkpointing</label>
        <label><input type="checkbox" v-model="skipVlm"> Skip VLM</label>
        <label><input type="checkbox" v-model="skipCloud"> Skip Cloud</label>
        <label>
          Log level
          <select v-model="logLevel" class="log-level-select">
            <option value="INFO">INFO</option>
            <option value="DEBUG">DEBUG</option>
          </select>
        </label>

        <div class="selected-path">{{ selectedChunk }}</div>

        <button class="run-btn" @click="runPipeline" :disabled="running">
          {{ running ? 'Starting...' : 'Run Pipeline' }}
        </button>
      </div>

      <div class="source-actions">
        <button class="editor-close" @click="pipeline.closeEditor()">✕ Close</button>
      </div>
    </div>
  </Panel>
</template>

<style scoped>
.source-selector {
  display: flex;
  flex-direction: column;
  height: 100%;
  gap: var(--space-3);
  padding: var(--space-2);
}

.source-error {
  color: var(--status-error);
  font-size: var(--font-size-sm);
  padding: var(--space-2);
  background: rgba(224, 82, 82, 0.1);
  border-radius: 4px;
}

.source-section {
  display: flex;
  flex-direction: column;
  gap: var(--space-1);
}

.source-section h3 {
  font-size: var(--font-size-sm);
  color: var(--text-dim);
  text-transform: uppercase;
  letter-spacing: 0.05em;
}

.source-list, .chunk-list {
  max-height: 200px;
  overflow-y: auto;
  background: var(--surface-2);
  border-radius: var(--panel-radius);
  padding: var(--space-1);
}

.source-item, .chunk-item {
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: var(--space-1) var(--space-2);
  border-radius: 3px;
  cursor: pointer;
  font-size: var(--font-size-sm);
  color: var(--text-secondary);
}

.source-item:hover, .chunk-item:hover {
  background: var(--surface-3);
}

.source-item.selected, .chunk-item.selected {
  background: var(--surface-3);
  color: var(--text-primary);
  font-weight: 600;
}

.source-id { font-family: var(--font-mono); }

.source-meta {
  display: flex;
  align-items: center;
  gap: var(--space-2);
}

.source-type-badge {
  font-family: var(--font-mono);
  font-size: 10px;
  font-weight: 700;
  color: var(--status-live);
  background: rgba(0, 255, 128, 0.1);
  border-radius: 2px;
  padding: 1px 4px;
}

.source-count, .chunk-size, .source-size { color: var(--text-dim); font-size: 11px; }

.log-level-select {
  background: var(--surface-2);
  border: 1px solid var(--surface-3);
  border-radius: 3px;
  color: var(--text-secondary);
  font-family: var(--font-mono);
  font-size: var(--font-size-sm);
  padding: 2px 4px;
  margin-left: var(--space-2);
}

.source-empty { color: var(--text-dim); text-align: center; padding: var(--space-3); font-size: var(--font-size-sm); }

.chunk-meta {
  display: flex;
  align-items: center;
  gap: var(--space-2);
}

.preview-btn {
  background: none;
  border: 1px solid var(--surface-3);
  border-radius: 3px;
  color: var(--text-dim);
  font-size: 10px;
  padding: 1px 5px;
  cursor: pointer;
  line-height: 1;
}

.preview-btn:hover {
  background: var(--surface-3);
  color: var(--text-primary);
}

.run-options {
  display: flex;
  flex-direction: column;
  gap: var(--space-2);
  font-size: var(--font-size-sm);
  color: var(--text-secondary);
}

.run-options label {
  display: flex;
  align-items: center;
  gap: var(--space-2);
  cursor: pointer;
}

.selected-path {
  font-family: var(--font-mono);
  font-size: 10px;
  color: var(--text-dim);
  padding: var(--space-1) var(--space-2);
  background: var(--surface-2);
  border-radius: 3px;
  word-break: break-all;
}

.run-btn {
  background: var(--status-live);
  color: #000;
  border: none;
  border-radius: 4px;
  padding: var(--space-2) var(--space-3);
  font-family: var(--font-mono);
  font-size: var(--font-size-sm);
  font-weight: 600;
  cursor: pointer;
}

.run-btn:hover { opacity: 0.9; }
.run-btn:disabled { opacity: 0.5; cursor: not-allowed; }

.source-actions {
  flex-shrink: 0;
  display: flex;
  justify-content: flex-end;
  margin-top: auto;
}

.editor-close {
  background: var(--surface-3);
  border: 1px solid var(--surface-3);
  border-radius: 4px;
  padding: var(--space-2) var(--space-3);
  color: var(--text-secondary);
  font-family: var(--font-mono);
  font-size: var(--font-size-sm);
  cursor: pointer;
}

.editor-close:hover {
  background: var(--status-error);
  color: #000;
}
</style>