chunker and ui

This commit is contained in:
2026-03-13 14:29:38 -03:00
parent 3eeedebb15
commit ccc478fbaa
69 changed files with 6481 additions and 282 deletions

30
.dockerignore Normal file
View File

@@ -0,0 +1,30 @@
# Python
.venv/
__pycache__/
*.pyc
*.egg-info/
.pytest_cache/
# Node
node_modules/
ui/*/node_modules/
ui/*/dist/
# Media (9.8GB — mounted via volume, never needed in image)
media/
# Git
.git/
# IDE / OS
.idea/
.vscode/
*.swp
.DS_Store
# Docker
ctrl/docker-compose.yml
# Docs
docs/
*.md

View File

@@ -7,4 +7,4 @@ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "admin.mpr.settings")
 app = Celery("mpr")
 app.config_from_object("django.conf:settings", namespace="CELERY")
 app.autodiscover_tasks()
-app.autodiscover_tasks(["core.task"])
+app.autodiscover_tasks(["core.jobs"])

View File

@@ -19,6 +19,15 @@ class JobStatus(models.TextChoices):
FAILED = "failed", "Failed" FAILED = "failed", "Failed"
CANCELLED = "cancelled", "Cancelled" CANCELLED = "cancelled", "Cancelled"
class ChunkJobStatus(models.TextChoices):
PENDING = "pending", "Pending"
CHUNKING = "chunking", "Chunking"
PROCESSING = "processing", "Processing"
COLLECTING = "collecting", "Collecting"
COMPLETED = "completed", "Completed"
FAILED = "failed", "Failed"
CANCELLED = "cancelled", "Cancelled"
class MediaAsset(models.Model): class MediaAsset(models.Model):
"""A video/audio file registered in the system.""" """A video/audio file registered in the system."""
@@ -108,3 +117,34 @@ class TranscodeJob(models.Model):
     def __str__(self):
         return str(self.id)
+
+
+class ChunkJob(models.Model):
+    """A chunk pipeline job — splits a media file into chunks and processes them."""
+
+    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
+    source_asset_id = models.UUIDField()
+    chunk_duration = models.FloatField(default=10.0)
+    num_workers = models.IntegerField(default=4)
+    max_retries = models.IntegerField(default=3)
+    processor_type = models.CharField(max_length=255)
+    status = models.CharField(max_length=20, choices=ChunkJobStatus.choices, default=ChunkJobStatus.PENDING)
+    progress = models.FloatField(default=0.0)
+    total_chunks = models.IntegerField(default=0)
+    processed_chunks = models.IntegerField(default=0)
+    failed_chunks = models.IntegerField(default=0)
+    retry_count = models.IntegerField(default=0)
+    error_message = models.TextField(blank=True, default='')
+    throughput_mbps = models.FloatField(null=True, blank=True, default=None)
+    elapsed_seconds = models.FloatField(null=True, blank=True, default=None)
+    celery_task_id = models.CharField(max_length=255, null=True, blank=True)
+    priority = models.IntegerField(default=0)
+    created_at = models.DateTimeField(auto_now_add=True)
+    started_at = models.DateTimeField(null=True, blank=True)
+    completed_at = models.DateTimeField(null=True, blank=True)
+
+    class Meta:
+        ordering = ["-created_at"]
+
+    def __str__(self):
+        return str(self.id)

78
core/api/chunker_sse.py Normal file
View File

@@ -0,0 +1,78 @@
"""
SSE endpoint for chunker pipeline events.
Bridges gRPC StreamProgress to browser-native EventSource.
GET /api/chunker/stream/{job_id} → text/event-stream
"""
import asyncio
import json
import logging
import time
from typing import AsyncGenerator
from fastapi import APIRouter
from starlette.responses import StreamingResponse
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/chunker", tags=["chunker"])
async def _event_generator(job_id: str) -> AsyncGenerator[str, None]:
"""
Generate SSE events by polling gRPC job state.
Yields server-sent events in the format:
event: <event_type>
data: <json_payload>
"""
from core.rpc.server import _active_jobs
last_state = None
timeout = time.monotonic() + 600 # 10 min max
while time.monotonic() < timeout:
job_state = _active_jobs.get(job_id)
if job_state is None:
# Job not found yet — may not have started
yield f"event: waiting\ndata: {json.dumps({'job_id': job_id})}\n\n"
await asyncio.sleep(0.5)
continue
# Only send if state changed
if job_state != last_state:
last_state = dict(job_state)
event_type = job_state.get("status", "update")
yield f"event: {event_type}\ndata: {json.dumps({**job_state, 'job_id': job_id})}\n\n"
# End stream when job is terminal
if event_type in ("completed", "failed", "cancelled"):
yield f"event: done\ndata: {json.dumps({'job_id': job_id})}\n\n"
break
await asyncio.sleep(0.2)
yield f"event: timeout\ndata: {json.dumps({'job_id': job_id})}\n\n"
@router.get("/stream/{job_id}")
async def stream_chunk_job(job_id: str):
"""
SSE stream for a chunk pipeline job.
The UI connects via native EventSource:
const es = new EventSource('/api/chunker/stream/<job_id>');
es.addEventListener('processing', (e) => { ... });
"""
return StreamingResponse(
_event_generator(job_id),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Accel-Buffering": "no",
},
)
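
For reference, a minimal Python client for this stream might look like the sketch below. It is not part of the commit; the base URL is an assumption and only the standard library is used.

# sse_client_sketch.py (illustrative only)
import urllib.request

def follow_chunk_job(job_id: str, base_url: str = "http://localhost:8000") -> None:
    """Print chunker events until the stream reaches a terminal event."""
    url = f"{base_url}/api/chunker/stream/{job_id}"
    with urllib.request.urlopen(url) as resp:
        event_type = None
        for raw in resp:  # HTTPResponse iterates line by line
            line = raw.decode("utf-8").rstrip("\n")
            if line.startswith("event: "):
                event_type = line[len("event: "):]
            elif line.startswith("data: "):
                print(event_type, line[len("data: "):])
                if event_type in ("done", "timeout"):
                    return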

View File

@@ -15,6 +15,8 @@ from strawberry.schema.config import StrawberryConfig
 from strawberry.types import Info

 from core.api.schema.graphql import (
+    ChunkJobType,
+    CreateChunkJobInput,
     CreateJobInput,
     DeleteResultType,
     MediaAssetType,
@@ -172,30 +174,31 @@ class Mutation:
             priority=input.priority or 0,
         )

+        payload = {
+            "source_key": source.file_path,
+            "output_key": output_filename,
+            "preset": preset_snapshot or None,
+            "trim_start": input.trim_start,
+            "trim_end": input.trim_end,
+            "duration": source.duration,
+        }
+
         executor_mode = os.environ.get("MPR_EXECUTOR", "local")
         if executor_mode in ("lambda", "gcp"):
-            from core.task.executor import get_executor
+            from core.jobs.executor import get_executor

             get_executor().run(
+                job_type="transcode",
                 job_id=str(job.id),
-                source_path=source.file_path,
-                output_path=output_filename,
-                preset=preset_snapshot or None,
-                trim_start=input.trim_start,
-                trim_end=input.trim_end,
-                duration=source.duration,
+                payload=payload,
             )
         else:
-            from core.task.tasks import run_transcode_job
-            result = run_transcode_job.delay(
+            from core.jobs.task import run_job
+
+            result = run_job.delay(
+                job_type="transcode",
                 job_id=str(job.id),
-                source_key=source.file_path,
-                output_key=output_filename,
-                preset=preset_snapshot or None,
-                trim_start=input.trim_start,
-                trim_end=input.trim_end,
-                duration=source.duration,
+                payload=payload,
             )
         job.celery_task_id = result.id
         job.save(update_fields=["celery_task_id"])
@@ -261,6 +264,62 @@ class Mutation:
         except Exception:
             raise Exception("Asset not found")

+    @strawberry.mutation
+    def create_chunk_job(self, info: Info, input: CreateChunkJobInput) -> ChunkJobType:
+        """Create and dispatch a chunk pipeline job."""
+        import uuid
+
+        from core.db import get_asset
+
+        try:
+            source = get_asset(input.source_asset_id)
+        except Exception:
+            raise Exception("Source asset not found")
+
+        job_id = str(uuid.uuid4())
+        payload = {
+            "source_key": source.file_path,
+            "chunk_duration": input.chunk_duration,
+            "num_workers": input.num_workers,
+            "max_retries": input.max_retries,
+            "processor_type": input.processor_type,
+        }
+
+        executor_mode = os.environ.get("MPR_EXECUTOR", "local")
+        celery_task_id = None
+        if executor_mode in ("lambda", "gcp"):
+            from core.jobs.executor import get_executor
+
+            get_executor().run(
+                job_type="chunk",
+                job_id=job_id,
+                payload=payload,
+            )
+        else:
+            from core.jobs.task import run_job
+
+            result = run_job.delay(
+                job_type="chunk",
+                job_id=job_id,
+                payload=payload,
+            )
+            celery_task_id = result.id
+
+        return ChunkJobType(
+            id=uuid.UUID(job_id),
+            source_asset_id=input.source_asset_id,
+            chunk_duration=input.chunk_duration,
+            num_workers=input.num_workers,
+            max_retries=input.max_retries,
+            processor_type=input.processor_type,
+            status="pending",
+            progress=0.0,
+            priority=input.priority,
+            celery_task_id=celery_task_id,
+        )
+
 # ---------------------------------------------------------------------------
 # Schema

View File

@@ -23,6 +23,7 @@ from fastapi import FastAPI, Header, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from strawberry.fastapi import GraphQLRouter

+from core.api.chunker_sse import router as chunker_router
 from core.api.graphql import schema as graphql_schema

 CALLBACK_API_KEY = os.environ.get("CALLBACK_API_KEY", "")
@@ -48,6 +49,9 @@ app.add_middleware(
 graphql_router = GraphQLRouter(schema=graphql_schema, graphql_ide="graphiql")
 app.include_router(graphql_router, prefix="/graphql")

+# Chunker SSE
+app.include_router(chunker_router)
+

 @app.get("/")
 def root():

View File

@@ -156,3 +156,52 @@ class WorkerStatusType:
     active_jobs: Optional[int] = None
     supported_codecs: Optional[List[str]] = None
     gpu_available: Optional[bool] = None
+
+
+@strawberry.enum
+class ChunkJobStatus(Enum):
+    PENDING = "pending"
+    CHUNKING = "chunking"
+    PROCESSING = "processing"
+    COLLECTING = "collecting"
+    COMPLETED = "completed"
+    FAILED = "failed"
+    CANCELLED = "cancelled"
+
+
+@strawberry.type
+class ChunkJobType:
+    """A chunk pipeline job."""
+
+    id: Optional[UUID] = None
+    source_asset_id: Optional[UUID] = None
+    chunk_duration: Optional[float] = None
+    num_workers: Optional[int] = None
+    max_retries: Optional[int] = None
+    processor_type: Optional[str] = None
+    status: Optional[str] = None
+    progress: Optional[float] = None
+    total_chunks: Optional[int] = None
+    processed_chunks: Optional[int] = None
+    failed_chunks: Optional[int] = None
+    retry_count: Optional[int] = None
+    error_message: Optional[str] = None
+    throughput_mbps: Optional[float] = None
+    elapsed_seconds: Optional[float] = None
+    celery_task_id: Optional[str] = None
+    priority: Optional[int] = None
+    created_at: Optional[datetime] = None
+    started_at: Optional[datetime] = None
+    completed_at: Optional[datetime] = None
+
+
+@strawberry.input
+class CreateChunkJobInput:
+    """Request body for creating a chunk pipeline job."""
+
+    source_asset_id: UUID
+    chunk_duration: float = 10.0
+    num_workers: int = 4
+    max_retries: int = 3
+    processor_type: str = "ffmpeg"
+    priority: int = 0
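
To exercise this input over HTTP, a request might look like the following sketch. It assumes Strawberry's default camelCase field conversion and a GraphQL endpoint at http://localhost:8000/graphql; the asset UUID is a placeholder.

import json
import urllib.request

query = """
mutation($input: CreateChunkJobInput!) {
  createChunkJob(input: $input) { id status processorType }
}
"""
variables = {"input": {"sourceAssetId": "<asset-uuid>", "chunkDuration": 10.0,
                       "numWorkers": 4, "processorType": "ffmpeg"}}
req = urllib.request.Request(
    "http://localhost:8000/graphql",
    data=json.dumps({"query": query, "variables": variables}).encode(),
    headers={"Content-Type": "application/json"},
)
print(json.loads(urllib.request.urlopen(req).read()))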

64
core/chunker/__init__.py Normal file
View File

@@ -0,0 +1,64 @@
"""
Chunker pipeline — splits files into chunks, processes concurrently, reassembles in order.
Public API:
Pipeline — orchestrates the full pipeline
PipelineResult — aggregate result dataclass
Chunker — file → Chunk generator
ChunkQueue — bounded thread-safe queue
WorkerPool — manages N worker threads
ResultCollector — heapq-based ordered reassembly
"""
from .chunker import Chunker
from .collector import ResultCollector
from .exceptions import (
ChunkChecksumError,
ChunkError,
ChunkReadError,
PipelineError,
ProcessingError,
ProcessorFailureError,
ProcessorTimeoutError,
ReassemblyError,
)
from .models import Chunk, ChunkResult, PipelineResult
from .pipeline import Pipeline
from .pool import WorkerPool
from .processor import (
ChecksumProcessor,
CompositeProcessor,
FFmpegExtractProcessor,
Processor,
SimulatedDecodeProcessor,
)
from .queue import ChunkQueue
__all__ = [
# Core
"Pipeline",
"PipelineResult",
# Components
"Chunker",
"ChunkQueue",
"WorkerPool",
"ResultCollector",
# Models
"Chunk",
"ChunkResult",
# Processors
"Processor",
"ChecksumProcessor",
"SimulatedDecodeProcessor",
"CompositeProcessor",
"FFmpegExtractProcessor",
# Exceptions
"PipelineError",
"ChunkError",
"ChunkReadError",
"ChunkChecksumError",
"ProcessingError",
"ProcessorFailureError",
"ProcessorTimeoutError",
"ReassemblyError",
]

86
core/chunker/chunker.py Normal file
View File

@@ -0,0 +1,86 @@
"""
Chunker — probes a media file and yields time-based Chunk objects.
Demonstrates:
- Function parameters and defaults (Interview Topic 1)
- List comprehensions and efficient iteration / generators (Interview Topic 3)
"""
import math
import os
from typing import Generator
from core.ffmpeg.probe import probe_file
from .exceptions import ChunkReadError
from .models import Chunk
class Chunker:
"""
Splits a media file into time-based chunks via a generator.
Uses FFmpeg probe to get duration, then yields Chunk objects
representing time segments (no data read — extraction happens in the processor).
Args:
file_path: Path to the source media file
chunk_duration: Duration of each chunk in seconds (default: 10.0)
"""
def __init__(self, file_path: str, chunk_duration: float = 10.0):
if not os.path.isfile(file_path):
raise ChunkReadError(f"File not found: {file_path}")
if chunk_duration <= 0:
raise ValueError("chunk_duration must be positive")
self.file_path = file_path
self.chunk_duration = chunk_duration
self.file_size = os.path.getsize(file_path)
self.source_duration = self._probe_duration()
def _probe_duration(self) -> float:
"""Get source file duration via FFmpeg probe."""
try:
result = probe_file(self.file_path)
if result.duration is None or result.duration <= 0:
raise ChunkReadError(
f"Cannot determine duration for {self.file_path}"
)
return result.duration
except ChunkReadError:
raise
except Exception as e:
raise ChunkReadError(
f"Failed to probe {self.file_path}: {e}"
) from e
@property
def expected_chunks(self) -> int:
"""Calculate expected number of chunks (last chunk may be shorter)."""
if self.source_duration <= 0:
return 0
return math.ceil(self.source_duration / self.chunk_duration)
def chunks(self) -> Generator[Chunk, None, None]:
"""
Yield Chunk objects representing time segments of the source file.
Generator-based: chunks are yielded on demand.
Each chunk defines a time range — actual extraction is done by the processor.
"""
total = self.expected_chunks
for sequence in range(total):
start_time = sequence * self.chunk_duration
end_time = min(
start_time + self.chunk_duration, self.source_duration
)
duration = end_time - start_time
yield Chunk(
sequence=sequence,
start_time=start_time,
end_time=end_time,
source_path=self.file_path,
duration=duration,
)
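
A usage sketch (assuming ffprobe is available and "media/sample.mp4" is a stand-in path): chunks are produced lazily, so nothing is read from disk here.

from core.chunker import Chunker

chunker = Chunker("media/sample.mp4", chunk_duration=10.0)
print(f"{chunker.expected_chunks} chunks over {chunker.source_duration:.1f}s")
for chunk in chunker.chunks():
    # Each Chunk is just a time range; extraction happens in the processor.
    print(chunk.sequence, chunk.start_time, chunk.end_time)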

98
core/chunker/collector.py Normal file
View File

@@ -0,0 +1,98 @@
"""
ResultCollector — reassembles chunk results in sequence order using a min-heap.
Demonstrates:
- Algorithms and sorting (Interview Topic 6) — heapq for ordered reassembly
- Core data structures (Interview Topic 5) — heap, deque
"""
import heapq
from collections import deque
from typing import List
from .exceptions import ReassemblyError
from .models import ChunkResult
class ResultCollector:
"""
Receives ChunkResults out of order, emits them in sequence order.
Uses a min-heap keyed on sequence number. Only emits a chunk when
all prior sequences have been accounted for.
Args:
total_chunks: Expected total number of chunks
"""
def __init__(self, total_chunks: int):
self.total_chunks = total_chunks
self._heap: List[tuple[int, ChunkResult]] = []
self._next_sequence = 0
self._emitted: List[ChunkResult] = []
self._seen_sequences: set[int] = set()
# Sliding window for throughput calculation
self._recent_times: deque[float] = deque(maxlen=50)
def add(self, result: ChunkResult) -> List[ChunkResult]:
"""
Add a result and return any newly emittable results in order.
Args:
result: A ChunkResult (may arrive out of order)
Returns:
List of results that can now be emitted in sequence order
(may be empty if we're still waiting for earlier sequences)
Raises:
ReassemblyError: If a duplicate sequence is received
"""
if result.sequence in self._seen_sequences:
raise ReassemblyError(
f"Duplicate sequence number: {result.sequence}"
)
self._seen_sequences.add(result.sequence)
# Track processing time for throughput
if result.processing_time > 0:
self._recent_times.append(result.processing_time)
# Push to min-heap
heapq.heappush(self._heap, (result.sequence, result))
# Emit all consecutive results starting from _next_sequence
newly_emitted = []
while self._heap and self._heap[0][0] == self._next_sequence:
_, emitted_result = heapq.heappop(self._heap)
self._emitted.append(emitted_result)
newly_emitted.append(emitted_result)
self._next_sequence += 1
return newly_emitted
@property
def is_complete(self) -> bool:
"""True if all expected chunks have been emitted in order."""
return self._next_sequence == self.total_chunks
@property
def buffered_count(self) -> int:
"""Number of results waiting in the heap (arrived out of order)."""
return len(self._heap)
@property
def emitted_count(self) -> int:
"""Number of results emitted in sequence order."""
return len(self._emitted)
@property
def avg_processing_time(self) -> float:
"""Average processing time from recent results (sliding window)."""
if not self._recent_times:
return 0.0
return sum(self._recent_times) / len(self._recent_times)
def get_ordered_results(self) -> List[ChunkResult]:
"""Get all emitted results in sequence order."""
return list(self._emitted)
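
A quick sketch of the reordering behavior: results added out of order are buffered in the heap until the missing sequence arrives.

from core.chunker import ChunkResult, ResultCollector

collector = ResultCollector(total_chunks=3)
print([r.sequence for r in collector.add(ChunkResult(sequence=2, success=True))])  # [] (waiting for 0)
print([r.sequence for r in collector.add(ChunkResult(sequence=0, success=True))])  # [0]
print([r.sequence for r in collector.add(ChunkResult(sequence=1, success=True))])  # [1, 2]
print(collector.is_complete)  # True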

View File

@@ -0,0 +1,64 @@
"""
Chunker exception hierarchy.
Demonstrates: Managing exceptions and writing resilient code (Interview Topic 7).
"""
class PipelineError(Exception):
"""Base exception for all chunker pipeline errors."""
pass
class ChunkError(PipelineError):
"""Errors related to chunk creation or validation."""
pass
class ChunkReadError(ChunkError):
"""Failed to read chunk data from source file."""
pass
class ChunkChecksumError(ChunkError):
"""Chunk data integrity validation failed."""
def __init__(self, sequence: int, expected: str, actual: str):
self.sequence = sequence
self.expected = expected
self.actual = actual
super().__init__(
f"Chunk {sequence}: checksum mismatch "
f"(expected={expected}, actual={actual})"
)
class ProcessingError(PipelineError):
"""Errors during chunk processing by workers."""
pass
class ProcessorTimeoutError(ProcessingError):
"""Processor exceeded allowed time for a chunk."""
def __init__(self, sequence: int, timeout: float):
self.sequence = sequence
self.timeout = timeout
super().__init__(f"Chunk {sequence}: processor timed out after {timeout}s")
class ProcessorFailureError(ProcessingError):
"""Processor failed to process a chunk after all retries."""
def __init__(self, sequence: int, retries: int, original_error: Exception):
self.sequence = sequence
self.retries = retries
self.original_error = original_error
super().__init__(
f"Chunk {sequence}: failed after {retries} retries — {original_error}"
)
class ReassemblyError(PipelineError):
"""Errors during result collection and ordering."""
pass

54
core/chunker/models.py Normal file
View File

@@ -0,0 +1,54 @@
"""
Internal data models for the chunker pipeline.
These are pipeline-internal dataclasses, not schema models.
Schema-level ChunkJob is in core/schema/models/jobs.py.
Demonstrates: Core data structures (Interview Topic 5).
"""
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
@dataclass
class Chunk:
"""A time-based segment of the source media file."""
sequence: int
start_time: float # seconds
end_time: float # seconds
source_path: str # path to source file
duration: float # end_time - start_time
checksum: str = "" # computed after extraction
@dataclass
class ChunkResult:
"""Result of processing a single chunk."""
sequence: int
success: bool
checksum_valid: bool = True
processing_time: float = 0.0
error: Optional[str] = None
retries: int = 0
worker_id: Optional[str] = None
output_file: Optional[str] = None
@dataclass
class PipelineResult:
"""Aggregate result of the entire pipeline run."""
total_chunks: int = 0
processed: int = 0
failed: int = 0
retries: int = 0
elapsed_time: float = 0.0
throughput_mbps: float = 0.0
worker_stats: Dict[str, Any] = field(default_factory=dict)
errors: List[str] = field(default_factory=list)
chunks_in_order: bool = True
output_dir: Optional[str] = None
chunk_files: List[str] = field(default_factory=list)

244
core/chunker/pipeline.py Normal file
View File

@@ -0,0 +1,244 @@
"""
Pipeline — orchestrates the entire chunker pipeline.
Wires: Chunker → ChunkQueue → WorkerPool → ResultCollector → PipelineResult
Demonstrates:
- Function parameters and defaults (Interview Topic 1) — configurable pipeline
- Concurrency (Interview Topic 2) — producer thread + worker pool
- OOP design (Interview Topic 4) — composition of pipeline components
- Exception handling (Interview Topic 7) — graceful error propagation
"""
import json
import logging
import threading
import time
from pathlib import Path
from typing import Any, Callable, Dict, Optional
from .chunker import Chunker
from .collector import ResultCollector
from .exceptions import PipelineError
from .models import PipelineResult
from .pool import WorkerPool
from .queue import ChunkQueue
logger = logging.getLogger(__name__)
class Pipeline:
"""
Orchestrates the chunk processing pipeline.
The pipeline runs in three stages:
1. Producer thread: Chunker probes file → pushes time-based chunks to ChunkQueue
2. Worker pool: N workers pull from queue → extract mp4 segments → emit results
3. Collector: ResultCollector reassembles results in sequence order
Args:
source: Path to the source media file
chunk_duration: Duration of each chunk in seconds (default: 10.0)
num_workers: Number of concurrent worker threads (default: 4)
max_retries: Max retry attempts per chunk (default: 3)
processor_type: Processor to use — "ffmpeg", "checksum", "simulated_decode", "composite"
queue_size: Max chunks buffered in queue (default: 10)
event_callback: Optional callback for real-time events
output_dir: Directory for output chunk files (required for "ffmpeg" processor)
"""
def __init__(
self,
source: str,
chunk_duration: float = 10.0,
num_workers: int = 4,
max_retries: int = 3,
processor_type: str = "checksum",
queue_size: int = 10,
event_callback: Optional[Callable[[str, Dict[str, Any]], None]] = None,
output_dir: Optional[str] = None,
):
self.source = source
self.chunk_duration = chunk_duration
self.num_workers = num_workers
self.max_retries = max_retries
self.processor_type = processor_type
self.queue_size = queue_size
self.event_callback = event_callback
self.output_dir = output_dir
def _emit(self, event_type: str, data: Dict[str, Any]) -> None:
"""Emit an event if callback is registered."""
if self.event_callback:
self.event_callback(event_type, data)
def _produce_chunks(
self, chunker: Chunker, chunk_queue: ChunkQueue
) -> None:
"""Producer thread: probe file and enqueue time-based chunks."""
try:
for chunk in chunker.chunks():
chunk_queue.put(chunk, timeout=30.0)
self._emit("chunk_queued", {
"sequence": chunk.sequence,
"start_time": chunk.start_time,
"end_time": chunk.end_time,
"duration": chunk.duration,
"queue_size": chunk_queue.qsize(),
})
except Exception as e:
logger.error(f"Producer error: {e}")
self._emit("producer_error", {"error": str(e)})
finally:
chunk_queue.close()
def _write_manifest(
self, result: PipelineResult, source_duration: float
) -> None:
"""Write manifest.json to output_dir with segment metadata."""
if not self.output_dir:
return
manifest = {
"source": self.source,
"source_duration": source_duration,
"chunk_duration": self.chunk_duration,
"total_chunks": result.total_chunks,
"processed": result.processed,
"failed": result.failed,
"elapsed_time": result.elapsed_time,
"throughput_mbps": result.throughput_mbps,
"segments": [
{
"sequence": i,
"file": f"chunk_{i:04d}.mp4",
"start": i * self.chunk_duration,
"end": min(
(i + 1) * self.chunk_duration, source_duration
),
}
for i in range(result.total_chunks)
],
}
manifest_path = Path(self.output_dir) / "manifest.json"
manifest_path.write_text(json.dumps(manifest, indent=2))
logger.info(f"Manifest written to {manifest_path}")
def run(self) -> PipelineResult:
"""
Execute the full pipeline.
Returns:
PipelineResult with aggregate stats
Raises:
PipelineError: If the pipeline fails catastrophically
"""
start_time = time.monotonic()
self._emit("pipeline_start", {
"source": self.source,
"chunk_duration": self.chunk_duration,
"num_workers": self.num_workers,
"processor_type": self.processor_type,
})
try:
# Stage 1: Set up chunker (probes file for duration)
chunker = Chunker(self.source, self.chunk_duration)
total_chunks = chunker.expected_chunks
if total_chunks == 0:
self._emit("pipeline_complete", {"total_chunks": 0})
return PipelineResult(chunks_in_order=True)
self._emit("pipeline_info", {
"file_size": chunker.file_size,
"source_duration": chunker.source_duration,
"total_chunks": total_chunks,
})
# Stage 2: Set up queue and worker pool
chunk_queue = ChunkQueue(maxsize=self.queue_size)
pool = WorkerPool(
num_workers=self.num_workers,
chunk_queue=chunk_queue,
processor_type=self.processor_type,
max_retries=self.max_retries,
event_callback=self.event_callback,
output_dir=self.output_dir,
)
# Stage 3: Start workers, then produce chunks
pool.start()
producer = threading.Thread(
target=self._produce_chunks,
args=(chunker, chunk_queue),
name="chunk-producer",
daemon=True,
)
producer.start()
# Stage 4: Wait for all workers to finish
all_results = pool.wait()
producer.join(timeout=5.0)
# Stage 5: Collect results in order
collector = ResultCollector(total_chunks)
for r in all_results:
collector.add(r)
self._emit("chunk_collected", {
"sequence": r.sequence,
"success": r.success,
"buffered": collector.buffered_count,
"emitted": collector.emitted_count,
})
# Build result
elapsed = time.monotonic() - start_time
file_size_mb = chunker.file_size / (1024 * 1024)
throughput = file_size_mb / elapsed if elapsed > 0 else 0.0
failed_results = [r for r in all_results if not r.success]
total_retries = sum(r.retries for r in all_results)
chunk_files = [
r.output_file for r in all_results
if r.success and r.output_file
]
result = PipelineResult(
total_chunks=total_chunks,
processed=len(all_results),
failed=len(failed_results),
retries=total_retries,
elapsed_time=elapsed,
throughput_mbps=throughput,
worker_stats=pool.get_worker_stats(),
errors=[r.error for r in failed_results if r.error],
chunks_in_order=collector.is_complete,
output_dir=self.output_dir,
chunk_files=chunk_files,
)
# Write manifest if output_dir is set
self._write_manifest(result, chunker.source_duration)
pool.shutdown()
self._emit("pipeline_complete", {
"total_chunks": result.total_chunks,
"processed": result.processed,
"failed": result.failed,
"elapsed": result.elapsed_time,
"throughput_mbps": result.throughput_mbps,
})
return result
except PipelineError:
raise
except Exception as e:
self._emit("pipeline_error", {"error": str(e)})
raise PipelineError(f"Pipeline failed: {e}") from e
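
End-to-end usage might look like the sketch below ("media/sample.mp4" is a stand-in path; the "checksum" processor avoids real FFmpeg extraction, so no output_dir is needed).

from core.chunker import Pipeline

def on_event(event_type, data):
    print(event_type, data)

pipeline = Pipeline(
    source="media/sample.mp4",
    chunk_duration=10.0,
    num_workers=4,
    processor_type="checksum",
    event_callback=on_event,
)
result = pipeline.run()
print(result.processed, result.failed, f"{result.throughput_mbps:.1f} MB/s")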

125
core/chunker/pool.py Normal file
View File

@@ -0,0 +1,125 @@
"""
WorkerPool — manages N worker threads via ThreadPoolExecutor.
Demonstrates: Python concurrency — threading (Interview Topic 2).
"""
import logging
import threading
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Any, Callable, Dict, List, Optional
from .models import ChunkResult
from .processor import (
ChecksumProcessor,
CompositeProcessor,
FFmpegExtractProcessor,
Processor,
SimulatedDecodeProcessor,
)
from .queue import ChunkQueue
from .worker import Worker
logger = logging.getLogger(__name__)
def create_processor(
processor_type: str = "checksum",
output_dir: Optional[str] = None,
) -> Processor:
"""Factory for processor instances."""
if processor_type == "ffmpeg":
if not output_dir:
raise ValueError("output_dir required for ffmpeg processor")
return FFmpegExtractProcessor(output_dir=output_dir)
elif processor_type == "checksum":
return ChecksumProcessor()
elif processor_type == "simulated_decode":
return SimulatedDecodeProcessor()
elif processor_type == "composite":
return CompositeProcessor([
ChecksumProcessor(),
SimulatedDecodeProcessor(ms_per_second=50.0),
])
else:
raise ValueError(f"Unknown processor type: {processor_type}")
class WorkerPool:
"""
Manages N worker threads that process chunks concurrently.
Args:
num_workers: Number of concurrent worker threads (default: 4)
chunk_queue: Shared queue to pull chunks from
processor_type: Type of processor for each worker (default: "checksum")
max_retries: Max retry attempts per chunk (default: 3)
event_callback: Optional callback for real-time events
"""
def __init__(
self,
num_workers: int = 4,
chunk_queue: Optional[ChunkQueue] = None,
processor_type: str = "checksum",
max_retries: int = 3,
event_callback: Optional[Callable[[str, Dict[str, Any]], None]] = None,
output_dir: Optional[str] = None,
):
self.num_workers = num_workers
self.chunk_queue = chunk_queue or ChunkQueue()
self.processor_type = processor_type
self.max_retries = max_retries
self.event_callback = event_callback
self.output_dir = output_dir
self.shutdown_event = threading.Event()
self._executor: Optional[ThreadPoolExecutor] = None
self._futures: List[Future] = []
self._workers: List[Worker] = []
def start(self) -> None:
"""Start all worker threads."""
self._executor = ThreadPoolExecutor(
max_workers=self.num_workers,
thread_name_prefix="chunk-worker",
)
for i in range(self.num_workers):
worker = Worker(
worker_id=f"worker-{i}",
chunk_queue=self.chunk_queue,
processor=create_processor(self.processor_type, output_dir=self.output_dir),
max_retries=self.max_retries,
event_callback=self.event_callback,
)
self._workers.append(worker)
future = self._executor.submit(worker.run)
self._futures.append(future)
logger.info(f"WorkerPool started with {self.num_workers} workers")
def wait(self) -> List[ChunkResult]:
"""Wait for all workers to finish and collect results."""
all_results = []
for future in self._futures:
results = future.result()
all_results.extend(results)
return all_results
def shutdown(self) -> None:
"""Signal shutdown and cleanup."""
self.shutdown_event.set()
self.chunk_queue.close()
if self._executor:
self._executor.shutdown(wait=True)
def get_worker_stats(self) -> Dict[str, Any]:
"""Get per-worker statistics."""
return {
w.worker_id: {
"processed": w.processed_count,
"errors": w.error_count,
"retries": w.retry_count,
}
for w in self._workers
}
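
The pool can also be driven directly, as in this sketch with a hand-filled queue and the FFmpeg-free "checksum" processor ("media/sample.mp4" is a stand-in path):

from core.chunker import Chunk, ChunkQueue, WorkerPool

q = ChunkQueue(maxsize=4)
pool = WorkerPool(num_workers=2, chunk_queue=q, processor_type="checksum")
pool.start()
for i in range(3):
    q.put(Chunk(sequence=i, start_time=i * 10.0, end_time=(i + 1) * 10.0,
                source_path="media/sample.mp4", duration=10.0))
q.close()  # sentinel tells workers to drain and stop
results = pool.wait()
pool.shutdown()
print(sorted(r.sequence for r in results))  # [0, 1, 2]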

173
core/chunker/processor.py Normal file
View File

@@ -0,0 +1,173 @@
"""
Processor ABC and concrete implementations.
Demonstrates: OOP design principles — ABC, inheritance, composition (Interview Topic 4).
"""
import hashlib
import time
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List
from .exceptions import ChunkChecksumError
from .models import Chunk, ChunkResult
class Processor(ABC):
"""
Abstract base class for chunk processors.
Each processor defines how a single chunk is processed.
The Worker calls processor.process(chunk) and handles retries.
"""
@abstractmethod
def process(self, chunk: Chunk) -> ChunkResult:
"""Process a single chunk and return the result."""
pass
class FFmpegExtractProcessor(Processor):
"""
Extracts a time segment from the source file using FFmpeg stream copy.
Produces a playable mp4 file per chunk — no re-encoding.
Args:
output_dir: Directory to write chunk mp4 files
"""
def __init__(self, output_dir: str):
self.output_dir = output_dir
Path(output_dir).mkdir(parents=True, exist_ok=True)
def process(self, chunk: Chunk) -> ChunkResult:
from core.ffmpeg.transcode import TranscodeConfig, transcode
start = time.monotonic()
output_file = str(
Path(self.output_dir) / f"chunk_{chunk.sequence:04d}.mp4"
)
config = TranscodeConfig(
input_path=chunk.source_path,
output_path=output_file,
video_codec="copy",
audio_codec="copy",
trim_start=chunk.start_time,
trim_end=chunk.end_time,
)
transcode(config)
# Compute checksum of output file
md5 = hashlib.md5()
with open(output_file, "rb") as f:
for block in iter(lambda: f.read(8192), b""):
md5.update(block)
checksum = md5.hexdigest()
elapsed = time.monotonic() - start
return ChunkResult(
sequence=chunk.sequence,
success=True,
checksum_valid=True,
processing_time=elapsed,
output_file=output_file,
)
class ChecksumProcessor(Processor):
"""
Validates chunk metadata consistency.
For time-based chunks, verifies the time range is valid.
Raises ChunkChecksumError on invalid ranges.
"""
def process(self, chunk: Chunk) -> ChunkResult:
start = time.monotonic()
valid = chunk.duration > 0 and chunk.end_time > chunk.start_time
if not valid:
raise ChunkChecksumError(
sequence=chunk.sequence,
expected="valid time range",
actual=f"{chunk.start_time}-{chunk.end_time}",
)
elapsed = time.monotonic() - start
return ChunkResult(
sequence=chunk.sequence,
success=True,
checksum_valid=True,
processing_time=elapsed,
)
class SimulatedDecodeProcessor(Processor):
"""
Simulates decode work by sleeping proportional to chunk duration.
Useful for demonstrating concurrency behavior without real FFmpeg.
Args:
ms_per_second: Milliseconds of simulated work per second of chunk duration (default: 100)
"""
def __init__(self, ms_per_second: float = 100.0):
self.ms_per_second = ms_per_second
def process(self, chunk: Chunk) -> ChunkResult:
start = time.monotonic()
sleep_time = (self.ms_per_second * chunk.duration) / 1000.0
time.sleep(sleep_time)
elapsed = time.monotonic() - start
return ChunkResult(
sequence=chunk.sequence,
success=True,
checksum_valid=True,
processing_time=elapsed,
)
class CompositeProcessor(Processor):
"""
Chains multiple processors — runs each in sequence on the same chunk.
Demonstrates OOP composition pattern.
Args:
processors: List of processors to chain
"""
def __init__(self, processors: List[Processor]):
if not processors:
raise ValueError("CompositeProcessor requires at least one processor")
self.processors = processors
def process(self, chunk: Chunk) -> ChunkResult:
start = time.monotonic()
last_result = None
for proc in self.processors:
last_result = proc.process(chunk)
if not last_result.success:
return last_result
elapsed = time.monotonic() - start
return ChunkResult(
sequence=chunk.sequence,
success=True,
checksum_valid=last_result.checksum_valid if last_result else True,
processing_time=elapsed,
)
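
Composition in practice, as a sketch: each processor in the chain sees the same chunk, and the composite reports the combined timing.

from core.chunker import (
    ChecksumProcessor,
    Chunk,
    CompositeProcessor,
    SimulatedDecodeProcessor,
)

proc = CompositeProcessor([
    ChecksumProcessor(),                         # validates the time range
    SimulatedDecodeProcessor(ms_per_second=10),  # then simulates decode work
])
chunk = Chunk(sequence=0, start_time=0.0, end_time=10.0,
              source_path="media/sample.mp4", duration=10.0)
print(proc.process(chunk).success)  # True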

76
core/chunker/queue.py Normal file
View File

@@ -0,0 +1,76 @@
"""
ChunkQueue — bounded, thread-safe queue with sentinel-based shutdown.
Demonstrates: Core data structures — queue.Queue (Interview Topic 5).
"""
import queue
from typing import Optional
from .models import Chunk
# Sentinel value to signal workers to stop
_SENTINEL = object()
class ChunkQueue:
"""
Thread-safe bounded queue for chunks.
Provides backpressure: producers block when the queue is full,
preventing unbounded memory usage.
Args:
maxsize: Maximum number of chunks in the queue (default: 10)
"""
def __init__(self, maxsize: int = 10):
self._queue: queue.Queue = queue.Queue(maxsize=maxsize)
self._closed = False
self.maxsize = maxsize
def put(self, chunk: Chunk, timeout: Optional[float] = None) -> None:
"""
Add a chunk to the queue. Blocks if full (backpressure).
Args:
chunk: The chunk to enqueue
timeout: Max seconds to wait (None = block forever)
Raises:
queue.Full: If timeout expires while queue is full
"""
self._queue.put(chunk, timeout=timeout)
def get(self, timeout: Optional[float] = None) -> Optional[Chunk]:
"""
Get next chunk from queue. Returns None if queue is closed.
Args:
timeout: Max seconds to wait (None = block forever)
Returns:
Chunk or None (if sentinel received, meaning queue is closed)
Raises:
queue.Empty: If timeout expires while queue is empty
"""
item = self._queue.get(timeout=timeout)
if item is _SENTINEL:
# Re-put sentinel so other workers also see it
self._queue.put(_SENTINEL)
return None
return item
def close(self) -> None:
"""Signal all consumers to stop by inserting a sentinel."""
self._closed = True
self._queue.put(_SENTINEL)
@property
def is_closed(self) -> bool:
return self._closed
def qsize(self) -> int:
"""Current number of items in the queue (approximate)."""
return self._queue.qsize()
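
A small sketch of the shutdown semantics: because get() re-inserts the sentinel, a single close() stops every consumer.

import threading

from core.chunker import Chunk, ChunkQueue

q = ChunkQueue(maxsize=2)

def consume(name: str) -> None:
    while (item := q.get(timeout=5.0)) is not None:
        print(name, "got chunk", item.sequence)

threads = [threading.Thread(target=consume, args=(f"c{i}",)) for i in range(2)]
for t in threads:
    t.start()
q.put(Chunk(sequence=0, start_time=0.0, end_time=1.0,
            source_path="x.mp4", duration=1.0))
q.close()  # one sentinel, both consumers exit
for t in threads:
    t.join()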

141
core/chunker/worker.py Normal file
View File

@@ -0,0 +1,141 @@
"""
Worker — pulls chunks from queue, processes with retry logic.
Demonstrates:
- Exception handling and resilient code (Interview Topic 7)
- Concurrency (Interview Topic 2) — workers run in thread pool
"""
import logging
import queue
import time
from typing import Any, Callable, Dict, Optional
from .exceptions import ProcessorFailureError
from .models import Chunk, ChunkResult
from .processor import Processor
from .queue import ChunkQueue
logger = logging.getLogger(__name__)
class Worker:
"""
Processes chunks from a queue with retry and exponential backoff.
Args:
worker_id: Identifier for this worker (e.g. "worker-0")
chunk_queue: Source queue to pull chunks from
processor: Processor instance to use
max_retries: Maximum retry attempts per chunk (default: 3)
event_callback: Optional callback for real-time status updates
"""
def __init__(
self,
worker_id: str,
chunk_queue: ChunkQueue,
processor: Processor,
max_retries: int = 3,
event_callback: Optional[Callable[[str, Dict[str, Any]], None]] = None,
):
self.worker_id = worker_id
self.chunk_queue = chunk_queue
self.processor = processor
self.max_retries = max_retries
self.event_callback = event_callback
self.processed_count = 0
self.error_count = 0
self.retry_count = 0
def _emit(self, event_type: str, data: Dict[str, Any]) -> None:
"""Emit an event if callback is registered."""
if self.event_callback:
self.event_callback(event_type, {"worker_id": self.worker_id, **data})
def _process_with_retry(self, chunk: Chunk) -> ChunkResult:
"""
Process a chunk with exponential backoff retry.
Retry delays: 0.1s, 0.2s, 0.4s, ... (doubles each attempt)
"""
last_error = None
for attempt in range(self.max_retries + 1):
try:
if attempt > 0:
backoff = 0.1 * (2 ** (attempt - 1))
self._emit("chunk_retry", {
"sequence": chunk.sequence,
"attempt": attempt,
"backoff": backoff,
})
time.sleep(backoff)
self.retry_count += 1
result = self.processor.process(chunk)
result.retries = attempt
result.worker_id = self.worker_id
return result
except Exception as e:
last_error = e
logger.warning(
f"{self.worker_id}: chunk {chunk.sequence} "
f"attempt {attempt + 1}/{self.max_retries + 1} failed: {e}"
)
# All retries exhausted
self.error_count += 1
self._emit("chunk_error", {
"sequence": chunk.sequence,
"error": str(last_error),
"retries": self.max_retries,
})
return ChunkResult(
sequence=chunk.sequence,
success=False,
processing_time=0.0,
error=str(last_error),
retries=self.max_retries,
worker_id=self.worker_id,
)
def run(self) -> list[ChunkResult]:
"""
Main worker loop — pull chunks and process until queue is closed.
Returns:
List of ChunkResults processed by this worker
"""
results = []
self._emit("worker_status", {"state": "idle"})
while True:
try:
chunk = self.chunk_queue.get(timeout=1.0)
except queue.Empty:
continue
if chunk is None: # Sentinel received
break
self._emit("chunk_processing", {
"sequence": chunk.sequence,
"state": "processing",
})
result = self._process_with_retry(chunk)
results.append(result)
self.processed_count += 1
self._emit("chunk_done", {
"sequence": chunk.sequence,
"success": result.success,
"processing_time": result.processing_time,
"retries": result.retries,
})
self._emit("worker_status", {"state": "stopped"})
return results
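
A sketch of the retry path using a deliberately flaky processor (hypothetical, for illustration): two transient failures, then success on the third attempt.

from core.chunker import Chunk, ChunkQueue, ChunkResult, Processor
from core.chunker.worker import Worker

class FlakyProcessor(Processor):
    """Fails twice, then succeeds (not part of the commit)."""
    def __init__(self):
        self.calls = 0
    def process(self, chunk):
        self.calls += 1
        if self.calls < 3:
            raise RuntimeError("transient failure")
        return ChunkResult(sequence=chunk.sequence, success=True)

q = ChunkQueue()
worker = Worker("worker-0", q, FlakyProcessor(), max_retries=3)
q.put(Chunk(sequence=0, start_time=0.0, end_time=1.0,
            source_path="x.mp4", duration=1.0))
q.close()
results = worker.run()
print(results[0].success, results[0].retries)  # True 2 (backoffs: 0.1s, 0.2s)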

15
core/jobs/__init__.py Normal file
View File

@@ -0,0 +1,15 @@
"""
MPR Jobs Module
Provides executor abstraction and task dispatch for job processing.
"""
from .executor import Executor, LocalExecutor, get_executor
from .task import run_job
__all__ = [
"Executor",
"LocalExecutor",
"get_executor",
"run_job",
]

View File

@@ -1,17 +1,16 @@
""" """
Executor abstraction for job processing. Executor abstraction for job processing.
Supports different backends: Determines WHERE jobs run:
- LocalExecutor: FFmpeg via Celery (default) - LocalExecutor: delegates to registered Handler (default)
- LambdaExecutor: AWS Lambda (future) - LambdaExecutor: AWS Step Functions
- GCPExecutor: Google Cloud Run Jobs
""" """
import os import os
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, Optional from typing import Any, Callable, Dict, Optional
from core.ffmpeg.transcode import TranscodeConfig, transcode
# Configuration from environment # Configuration from environment
MPR_EXECUTOR = os.environ.get("MPR_EXECUTOR", "local") MPR_EXECUTOR = os.environ.get("MPR_EXECUTOR", "local")
@@ -22,26 +21,18 @@ class Executor(ABC):
     @abstractmethod
     def run(
         self,
+        job_type: str,
         job_id: str,
-        source_path: str,
-        output_path: str,
-        preset: Optional[Dict[str, Any]] = None,
-        trim_start: Optional[float] = None,
-        trim_end: Optional[float] = None,
-        duration: Optional[float] = None,
+        payload: Dict[str, Any],
         progress_callback: Optional[Callable[[int, Dict[str, Any]], None]] = None,
     ) -> bool:
         """
-        Execute a transcode/trim job.
+        Execute a job.

         Args:
+            job_type: Type of job ("transcode", "chunk", etc.)
             job_id: Unique job identifier
-            source_path: Path to source file
-            output_path: Path for output file
-            preset: Transcode preset dict (optional, None = trim only)
-            trim_start: Trim start time in seconds (optional)
-            trim_end: Trim end time in seconds (optional)
-            duration: Source duration in seconds (for progress calculation)
+            payload: Job-type-specific configuration dict
             progress_callback: Called with (percent, details_dict)

         Returns:
@@ -51,62 +42,25 @@ class Executor(ABC):
 class LocalExecutor(Executor):
-    """Execute jobs locally using FFmpeg."""
+    """Execute jobs locally using registered handlers."""

     def run(
         self,
+        job_type: str,
         job_id: str,
-        source_path: str,
-        output_path: str,
-        preset: Optional[Dict[str, Any]] = None,
-        trim_start: Optional[float] = None,
-        trim_end: Optional[float] = None,
-        duration: Optional[float] = None,
+        payload: Dict[str, Any],
         progress_callback: Optional[Callable[[int, Dict[str, Any]], None]] = None,
     ) -> bool:
-        """Execute job using local FFmpeg."""
+        """Execute job using the appropriate local handler."""
+        from .registry import get_handler

-        # Build config from preset or use stream copy for trim-only
-        if preset:
-            config = TranscodeConfig(
-                input_path=source_path,
-                output_path=output_path,
-                video_codec=preset.get("video_codec", "libx264"),
-                video_bitrate=preset.get("video_bitrate"),
-                video_crf=preset.get("video_crf"),
-                video_preset=preset.get("video_preset"),
-                resolution=preset.get("resolution"),
-                framerate=preset.get("framerate"),
-                audio_codec=preset.get("audio_codec", "aac"),
-                audio_bitrate=preset.get("audio_bitrate"),
-                audio_channels=preset.get("audio_channels"),
-                audio_samplerate=preset.get("audio_samplerate"),
-                container=preset.get("container", "mp4"),
-                extra_args=preset.get("extra_args", []),
-                trim_start=trim_start,
-                trim_end=trim_end,
-            )
-        else:
-            # Trim-only: stream copy
-            config = TranscodeConfig(
-                input_path=source_path,
-                output_path=output_path,
-                video_codec="copy",
-                audio_codec="copy",
-                trim_start=trim_start,
-                trim_end=trim_end,
-            )
-
-        # Wrapper to convert float percent to int
-        def wrapped_callback(percent: float, details: Dict[str, Any]) -> None:
-            if progress_callback:
-                progress_callback(int(percent), details)
-
-        return transcode(
-            config,
-            duration=duration,
-            progress_callback=wrapped_callback if progress_callback else None,
-        )
+        handler = get_handler(job_type)
+        result = handler.process(
+            job_id=job_id,
+            payload=payload,
+            progress_callback=progress_callback,
+        )
+        return result.get("status") == "completed"


 class LambdaExecutor(Executor):
@@ -123,26 +77,18 @@ class LambdaExecutor(Executor):
     def run(
         self,
+        job_type: str,
         job_id: str,
-        source_path: str,
-        output_path: str,
-        preset: Optional[Dict[str, Any]] = None,
-        trim_start: Optional[float] = None,
-        trim_end: Optional[float] = None,
-        duration: Optional[float] = None,
+        payload: Dict[str, Any],
         progress_callback: Optional[Callable[[int, Dict[str, Any]], None]] = None,
     ) -> bool:
         """Start a Step Functions execution for this job."""
         import json

-        payload = {
+        sfn_payload = {
+            "job_type": job_type,
             "job_id": job_id,
-            "source_key": source_path,
-            "output_key": output_path,
-            "preset": preset,
-            "trim_start": trim_start,
-            "trim_end": trim_end,
-            "duration": duration,
+            **payload,
             "callback_url": self.callback_url,
             "api_key": self.callback_api_key,
         }
@@ -150,10 +96,9 @@ class LambdaExecutor(Executor):
         response = self.sfn.start_execution(
             stateMachineArn=self.state_machine_arn,
             name=f"mpr-{job_id}",
-            input=json.dumps(payload),
+            input=json.dumps(sfn_payload),
         )

-        # Store execution ARN on the job
         execution_arn = response["executionArn"]
         try:
             from core.db import update_job_fields
@@ -179,13 +124,9 @@ class GCPExecutor(Executor):
     def run(
         self,
+        job_type: str,
         job_id: str,
-        source_path: str,
-        output_path: str,
-        preset: Optional[Dict[str, Any]] = None,
-        trim_start: Optional[float] = None,
-        trim_end: Optional[float] = None,
-        duration: Optional[float] = None,
+        payload: Dict[str, Any],
         progress_callback: Optional[Callable[[int, Dict[str, Any]], None]] = None,
     ) -> bool:
         """Trigger a Cloud Run Job execution for this job."""
@@ -193,14 +134,10 @@ class GCPExecutor(Executor):
         from google.cloud import run_v2

-        payload = {
+        gcp_payload = {
+            "job_type": job_type,
             "job_id": job_id,
-            "source_key": source_path,
-            "output_key": output_path,
-            "preset": preset,
-            "trim_start": trim_start,
-            "trim_end": trim_end,
-            "duration": duration,
+            **payload,
             "callback_url": self.callback_url,
             "api_key": self.callback_api_key,
         }
@@ -216,7 +153,8 @@ class GCPExecutor(Executor):
                 run_v2.RunJobRequest.Overrides.ContainerOverride(
                     env=[
                         run_v2.EnvVar(
-                            name="MPR_JOB_PAYLOAD", value=json.dumps(payload)
+                            name="MPR_JOB_PAYLOAD",
+                            value=json.dumps(gcp_payload),
                         )
                     ]
                 )
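
Backend selection, sketched below. This is illustrative only: the handlers registered in this commit expect S3/MinIO storage to be reachable, so it shows the shape of the call rather than a demo that runs against an empty environment.

import os
import uuid

os.environ.setdefault("MPR_EXECUTOR", "local")

from core.jobs import get_executor

ok = get_executor().run(
    job_type="transcode",
    job_id=str(uuid.uuid4()),
    payload={"source_key": "in.mp4", "output_key": "out.mp4", "preset": None},
)
print(ok)  # True when the handler reports {"status": "completed"}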

View File

@@ -0,0 +1,5 @@
"""Job handlers — type-specific execution logic."""
from .base import Handler
__all__ = ["Handler"]

View File

@@ -0,0 +1,33 @@
"""
Base Handler ABC — defines the interface for job-type-specific execution logic.
A Handler knows HOW to execute a specific kind of job (transcode, chunk, etc.).
The Executor decides WHERE to run it (local, Lambda, GCP).
"""
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, Optional
class Handler(ABC):
"""Abstract base class for job handlers."""
@abstractmethod
def process(
self,
job_id: str,
payload: Dict[str, Any],
progress_callback: Optional[Callable[[int, Dict[str, Any]], None]] = None,
) -> Dict[str, Any]:
"""
Execute job-specific logic.
Args:
job_id: Unique job identifier
payload: Job-type-specific configuration
progress_callback: Called with (percent, details_dict)
Returns:
Result dict with at least {"status": "completed"} or raises
"""
pass
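
A minimal conforming implementation might look like this sketch (a hypothetical "noop" handler, not part of the commit):

from typing import Any, Callable, Dict, Optional

from core.jobs.handlers.base import Handler

class NoopHandler(Handler):
    """Completes immediately; handy for wiring tests."""

    def process(
        self,
        job_id: str,
        payload: Dict[str, Any],
        progress_callback: Optional[Callable[[int, Dict[str, Any]], None]] = None,
    ) -> Dict[str, Any]:
        if progress_callback:
            progress_callback(100, {"note": "nothing to do"})
        return {"status": "completed", "job_id": job_id}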

119
core/jobs/handlers/chunk.py Normal file
View File

@@ -0,0 +1,119 @@
"""
ChunkHandler — job handler that wraps the chunker Pipeline.
Downloads source from S3/MinIO, runs FFmpeg chunking pipeline,
uploads mp4 segments + manifest back to S3/MinIO.
"""
import logging
import os
import shutil
import tempfile
from typing import Any, Callable, Dict, Optional
from core.chunker import Pipeline
from core.storage import BUCKET_IN, BUCKET_OUT, download_to_temp, upload_file
from .base import Handler
logger = logging.getLogger(__name__)
class ChunkHandler(Handler):
"""
Handles chunk processing jobs by delegating to the chunker Pipeline.
Expected payload keys:
source_key: str — S3 key of the source file in BUCKET_IN
chunk_duration: float — seconds per chunk (default: 10.0)
num_workers: int — concurrent workers (default: 4)
max_retries: int — retries per chunk (default: 3)
processor_type: str — "ffmpeg", "checksum", "simulated_decode", "composite"
queue_size: int — max queue depth (default: 10)
"""
def process(
self,
job_id: str,
payload: Dict[str, Any],
progress_callback: Optional[Callable[[int, Dict[str, Any]], None]] = None,
) -> Dict[str, Any]:
source_key = payload["source_key"]
processor_type = payload.get("processor_type", "ffmpeg")
logger.info(f"ChunkHandler starting job {job_id}: {source_key}")
# Download source from S3/MinIO
tmp_source = download_to_temp(BUCKET_IN, source_key)
# Create temp output directory for chunks
tmp_output_dir = tempfile.mkdtemp(prefix=f"chunks-{job_id}-")
try:
def event_bridge(event_type: str, data: Dict[str, Any]) -> None:
"""Bridge pipeline events to the job progress callback."""
if progress_callback and event_type == "pipeline_complete":
progress_callback(100, data)
elif progress_callback and event_type == "chunk_done":
total = data.get("total_chunks", 1)
if total > 0:
pct = min(int((data.get("sequence", 0) + 1) / total * 100), 99)
progress_callback(pct, data)
pipeline = Pipeline(
source=tmp_source,
chunk_duration=payload.get("chunk_duration", 10.0),
num_workers=payload.get("num_workers", 4),
max_retries=payload.get("max_retries", 3),
processor_type=processor_type,
queue_size=payload.get("queue_size", 10),
event_callback=event_bridge,
output_dir=tmp_output_dir if processor_type == "ffmpeg" else None,
)
result = pipeline.run()
# Upload chunks + manifest to S3/MinIO
output_prefix = f"chunks/{job_id}"
uploaded_files = []
for chunk_file in result.chunk_files:
filename = os.path.basename(chunk_file)
output_key = f"{output_prefix}/{filename}"
upload_file(chunk_file, BUCKET_OUT, output_key)
uploaded_files.append(output_key)
logger.info(f"Uploaded {output_key}")
# Upload manifest
manifest_path = os.path.join(tmp_output_dir, "manifest.json")
if os.path.exists(manifest_path):
manifest_key = f"{output_prefix}/manifest.json"
upload_file(manifest_path, BUCKET_OUT, manifest_key)
uploaded_files.append(manifest_key)
logger.info(f"Uploaded {manifest_key}")
return {
"status": "completed" if result.failed == 0 else "completed_with_errors",
"total_chunks": result.total_chunks,
"processed": result.processed,
"failed": result.failed,
"retries": result.retries,
"elapsed_time": result.elapsed_time,
"throughput_mbps": result.throughput_mbps,
"worker_stats": result.worker_stats,
"errors": result.errors,
"chunks_in_order": result.chunks_in_order,
"output_prefix": output_prefix,
"uploaded_files": uploaded_files,
}
finally:
# Cleanup temp files
try:
os.unlink(tmp_source)
except OSError:
pass
try:
shutil.rmtree(tmp_output_dir, ignore_errors=True)
except OSError:
pass

View File

@@ -0,0 +1,104 @@
"""
TranscodeHandler — executes transcode/trim jobs using FFmpeg.
Extracted from the old tasks.py Celery task logic.
"""
import logging
import os
import tempfile
from pathlib import Path
from typing import Any, Callable, Dict, Optional
from core.ffmpeg.transcode import TranscodeConfig, transcode
from core.storage import BUCKET_IN, BUCKET_OUT, download_to_temp, upload_file
from .base import Handler
logger = logging.getLogger(__name__)
class TranscodeHandler(Handler):
"""Handle transcode and trim jobs via FFmpeg."""
def process(
self,
job_id: str,
payload: Dict[str, Any],
progress_callback: Optional[Callable[[int, Dict[str, Any]], None]] = None,
) -> Dict[str, Any]:
source_key = payload["source_key"]
output_key = payload["output_key"]
preset = payload.get("preset")
trim_start = payload.get("trim_start")
trim_end = payload.get("trim_end")
duration = payload.get("duration")
logger.info(f"TranscodeHandler: {source_key} -> {output_key}")
# Download source
tmp_source = download_to_temp(BUCKET_IN, source_key)
ext = Path(output_key).suffix or ".mp4"
fd, tmp_output = tempfile.mkstemp(suffix=ext)
os.close(fd)
try:
if preset:
config = TranscodeConfig(
input_path=tmp_source,
output_path=tmp_output,
video_codec=preset.get("video_codec", "libx264"),
video_bitrate=preset.get("video_bitrate"),
video_crf=preset.get("video_crf"),
video_preset=preset.get("video_preset"),
resolution=preset.get("resolution"),
framerate=preset.get("framerate"),
audio_codec=preset.get("audio_codec", "aac"),
audio_bitrate=preset.get("audio_bitrate"),
audio_channels=preset.get("audio_channels"),
audio_samplerate=preset.get("audio_samplerate"),
container=preset.get("container", "mp4"),
extra_args=preset.get("extra_args", []),
trim_start=trim_start,
trim_end=trim_end,
)
else:
config = TranscodeConfig(
input_path=tmp_source,
output_path=tmp_output,
video_codec="copy",
audio_codec="copy",
trim_start=trim_start,
trim_end=trim_end,
)
def wrapped_callback(percent: float, details: Dict[str, Any]) -> None:
if progress_callback:
progress_callback(int(percent), details)
success = transcode(
config,
duration=duration,
progress_callback=wrapped_callback if progress_callback else None,
)
if not success:
raise RuntimeError("Transcode returned False")
# Upload result
logger.info(f"Uploading {output_key} to {BUCKET_OUT}")
upload_file(tmp_output, BUCKET_OUT, output_key)
return {
"status": "completed",
"job_id": job_id,
"output_key": output_key,
}
finally:
for f in [tmp_source, tmp_output]:
try:
os.unlink(f)
except OSError:
pass

33
core/jobs/registry.py Normal file
View File

@@ -0,0 +1,33 @@
"""
Handler registry — maps job_type strings to Handler classes.
"""
from typing import Dict, Type
from .handlers.base import Handler
_handlers: Dict[str, Type[Handler]] = {}
def register_handler(job_type: str, handler_class: Type[Handler]) -> None:
"""Register a handler class for a job type."""
_handlers[job_type] = handler_class
def get_handler(job_type: str) -> Handler:
"""Get an instantiated handler for a job type."""
if job_type not in _handlers:
raise ValueError(f"Unknown job type: {job_type}")
return _handlers[job_type]()
def _register_defaults() -> None:
"""Register built-in handlers."""
from .handlers.chunk import ChunkHandler
from .handlers.transcode import TranscodeHandler
register_handler("transcode", TranscodeHandler)
register_handler("chunk", ChunkHandler)
_register_defaults()
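
Third-party job types can plug in the same way the defaults do, as in this sketch (EchoHandler is hypothetical):

from core.jobs.handlers.base import Handler
from core.jobs.registry import get_handler, register_handler

class EchoHandler(Handler):
    """Returns its payload; used only for this sketch."""
    def process(self, job_id, payload, progress_callback=None):
        return {"status": "completed", "job_id": job_id, "echo": payload}

register_handler("echo", EchoHandler)
print(get_handler("echo").process(job_id="job-1", payload={"k": "v"}))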

64
core/jobs/task.py Normal file
View File

@@ -0,0 +1,64 @@
"""
Celery task for job processing.
Generic dispatcher — routes to the appropriate handler based on job_type.
"""
import logging
from typing import Any, Dict
from celery import shared_task
from core.rpc.server import update_job_progress
logger = logging.getLogger(__name__)
@shared_task(bind=True, max_retries=3, default_retry_delay=60)
def run_job(
self,
job_type: str,
job_id: str,
payload: Dict[str, Any],
) -> Dict[str, Any]:
"""
Generic Celery task — dispatches to the registered handler for job_type.
"""
logger.info(f"Starting {job_type} job {job_id}")
update_job_progress(job_id, progress=0, status="processing")
def progress_callback(percent: int, details: Dict[str, Any]) -> None:
update_job_progress(
job_id,
progress=percent,
current_time=details.get("time", 0.0),
status="processing",
)
try:
from .registry import get_handler
handler = get_handler(job_type)
result = handler.process(
job_id=job_id,
payload=payload,
progress_callback=progress_callback,
)
logger.info(f"Job {job_id} completed successfully")
update_job_progress(job_id, progress=100, status="completed")
return result
except Exception as e:
logger.exception(f"Job {job_id} failed: {e}")
update_job_progress(job_id, progress=0, status="failed", error=str(e))
if self.request.retries < self.max_retries:
raise self.retry(exc=e)
return {
"status": "failed",
"job_id": job_id,
"error": str(e),
}
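
Callers enqueue work in the same shape the GraphQL mutation uses, as in this sketch (requires a running Celery broker; the object key is a stand-in):

import uuid

from core.jobs.task import run_job

result = run_job.delay(
    job_type="chunk",
    job_id=str(uuid.uuid4()),
    payload={
        "source_key": "uploads/sample.mp4",
        "chunk_duration": 10.0,
        "num_workers": 4,
        "max_retries": 3,
        "processor_type": "ffmpeg",
    },
)
print(result.id)  # stored on the job as celery_task_id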

View File

@@ -59,17 +59,24 @@ class WorkerServicer(worker_pb2_grpc.WorkerServiceServicer):
         # Dispatch to Celery if available
         if self.celery_app:
-            from core.task.tasks import run_transcode_job
+            from core.jobs.task import run_job

-            task = run_transcode_job.delay(
-                job_id=job_id,
-                source_path=request.source_path,
-                output_path=request.output_path,
-                preset=preset,
-                trim_start=request.trim_start
-                if request.HasField("trim_start")
-                else None,
-                trim_end=request.trim_end if request.HasField("trim_end") else None,
+            payload = {
+                "source_key": request.source_path,
+                "output_key": request.output_path,
+                "preset": preset,
+                "trim_start": request.trim_start
+                if request.HasField("trim_start")
+                else None,
+                "trim_end": request.trim_end
+                if request.HasField("trim_end")
+                else None,
+            }
+            task = run_job.delay(
+                job_type="transcode",
+                job_id=job_id,
+                payload=payload,
             )
             _active_jobs[job_id]["celery_task_id"] = task.id
@@ -197,11 +204,14 @@ def update_job_progress(
    speed: float = 0.0,
    status: str = "processing",
    error: str = None,
+   **extra,
) -> None:
    """
    Update job progress (called from worker tasks).

    Updates both the in-memory gRPC state and the Django database.
+   Extra kwargs are stored for chunker-specific fields (total_chunks,
+   processed_chunks, failed_chunks, throughput_mbps, etc.).
    """
    if job_id in _active_jobs:
        _active_jobs[job_id].update(
@@ -212,6 +222,7 @@ def update_job_progress(
"speed": speed, "speed": speed,
"status": status, "status": status,
"error": error, "error": error,
**extra,
} }
) )
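
Because **extra flows into _active_jobs unchanged, a chunk job can surface its own counters to the progress stream with no further server changes. A hedged example call; the field names follow the ChunkJob model added in this commit:

from core.rpc.server import update_job_progress

job_id = "00000000-0000-0000-0000-000000000000"  # placeholder
update_job_progress(
    job_id,
    progress=41.7,
    status="processing",
    total_chunks=24,        # chunker-specific fields travel via **extra
    processed_chunks=10,
    failed_chunks=0,
    throughput_mbps=93.5,
)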

View File

@@ -23,12 +23,12 @@ from .grpc import (
    ProgressUpdate,
    WorkerStatus,
)
-from .jobs import JobStatus, TranscodeJob
+from .jobs import ChunkJob, ChunkJobStatus, JobStatus, TranscodeJob
from .media import AssetStatus, MediaAsset
from .presets import BUILTIN_PRESETS, TranscodePreset

# Core domain models - generates Django, Pydantic, TypeScript
-DATACLASSES = [MediaAsset, TranscodePreset, TranscodeJob]
+DATACLASSES = [MediaAsset, TranscodePreset, TranscodeJob, ChunkJob]

# API request/response models - generates TypeScript only (no Django)
# WorkerStatus from grpc.py is reused here
@@ -42,7 +42,7 @@ API_MODELS = [
]

# Status enums - included in generated code
-ENUMS = [AssetStatus, JobStatus]
+ENUMS = [AssetStatus, JobStatus, ChunkJobStatus]

# gRPC messages - generates Proto
GRPC_MESSAGES = [
@@ -61,6 +61,7 @@ __all__ = [
"MediaAsset", "MediaAsset",
"TranscodePreset", "TranscodePreset",
"TranscodeJob", "TranscodeJob",
"ChunkJob",
# API Models # API Models
"CreateJobRequest", "CreateJobRequest",
"UpdateAssetRequest", "UpdateAssetRequest",
@@ -70,6 +71,7 @@ __all__ = [
# Enums
    "AssetStatus",
    "JobStatus",
+   "ChunkJobStatus",
    # gRPC
    "GRPC_SERVICE",
    "JobRequest",

View File

@@ -1,13 +1,14 @@
""" """
TranscodeJob Schema Definition Job Schema Definitions
Source of truth for job data model. Source of truth for job data models.
TranscodeJob and ChunkJob share common lifecycle fields by convention.
""" """
from dataclasses import dataclass, field from dataclasses import dataclass, field
from datetime import datetime from datetime import datetime
from enum import Enum from enum import Enum
from typing import Any, Dict, Optional from typing import Any, Dict, List, Optional
from uuid import UUID from uuid import UUID
@@ -77,3 +78,56 @@ class TranscodeJob:
        return self.preset_id is None and (
            self.trim_start is not None or self.trim_end is not None
        )
class ChunkJobStatus(str, Enum):
"""Status of a chunk pipeline job."""
PENDING = "pending"
CHUNKING = "chunking"
PROCESSING = "processing"
COLLECTING = "collecting"
COMPLETED = "completed"
FAILED = "failed"
CANCELLED = "cancelled"
@dataclass
class ChunkJob:
"""
A chunk pipeline job — splits a media file into chunks and processes them
through a concurrent worker pool.
"""
id: UUID
# Input
source_asset_id: UUID
# Configuration
chunk_duration: float = 10.0 # seconds
num_workers: int = 4
max_retries: int = 3
processor_type: str = "ffmpeg" # "ffmpeg", "checksum", "simulated_decode", "composite"
# Status & Progress
status: ChunkJobStatus = ChunkJobStatus.PENDING
progress: float = 0.0 # 0.0 to 100.0
total_chunks: int = 0
processed_chunks: int = 0
failed_chunks: int = 0
retry_count: int = 0
error_message: Optional[str] = None
# Result stats
throughput_mbps: Optional[float] = None
elapsed_seconds: Optional[float] = None
# Worker tracking
celery_task_id: Optional[str] = None
priority: int = 0 # Lower = higher priority
# Timestamps
created_at: Optional[datetime] = None
started_at: Optional[datetime] = None
completed_at: Optional[datetime] = None

core/task/__init__.py
View File

@@ -1,15 +0,0 @@
"""
MPR Worker Module
Provides executor abstraction and Celery tasks for job processing.
"""
from .executor import Executor, LocalExecutor, get_executor
from .tasks import run_transcode_job
__all__ = [
"Executor",
"LocalExecutor",
"get_executor",
"run_transcode_job",
]

core/task/tasks.py
View File

@@ -1,105 +0,0 @@
"""
Celery tasks for job processing.
"""
import logging
import os
from typing import Any, Dict, Optional
from celery import shared_task
from core.storage import BUCKET_IN, BUCKET_OUT, download_to_temp, upload_file
from core.rpc.server import update_job_progress
from core.task.executor import get_executor
logger = logging.getLogger(__name__)
@shared_task(bind=True, queue="transcode", max_retries=3, default_retry_delay=60)
def run_transcode_job(
self,
job_id: str,
source_key: str,
output_key: str,
preset: Optional[Dict[str, Any]] = None,
trim_start: Optional[float] = None,
trim_end: Optional[float] = None,
duration: Optional[float] = None,
) -> Dict[str, Any]:
"""
Celery task to run a transcode/trim job.
Downloads source from S3, runs FFmpeg, uploads result to S3.
"""
logger.info(f"Starting job {job_id}: {source_key} -> {output_key}")
update_job_progress(job_id, progress=0, status="processing")
# Download source from S3 to temp file
logger.info(f"Downloading {source_key} from {BUCKET_IN}")
tmp_source = download_to_temp(BUCKET_IN, source_key)
# Create temp output path with same extension
import tempfile
from pathlib import Path
ext = Path(output_key).suffix or ".mp4"
fd, tmp_output = tempfile.mkstemp(suffix=ext)
os.close(fd)
def progress_callback(percent: int, details: Dict[str, Any]) -> None:
update_job_progress(
job_id,
progress=percent,
current_time=details.get("time", 0.0),
status="processing",
)
try:
executor = get_executor()
success = executor.run(
job_id=job_id,
source_path=tmp_source,
output_path=tmp_output,
preset=preset,
trim_start=trim_start,
trim_end=trim_end,
duration=duration,
progress_callback=progress_callback,
)
if success:
# Upload result to S3
logger.info(f"Uploading {output_key} to {BUCKET_OUT}")
upload_file(tmp_output, BUCKET_OUT, output_key)
logger.info(f"Job {job_id} completed successfully")
update_job_progress(job_id, progress=100, status="completed")
return {
"status": "completed",
"job_id": job_id,
"output_key": output_key,
}
else:
raise RuntimeError("Executor returned False")
except Exception as e:
logger.exception(f"Job {job_id} failed: {e}")
update_job_progress(job_id, progress=0, status="failed", error=str(e))
if self.request.retries < self.max_retries:
raise self.retry(exc=e)
return {
"status": "failed",
"job_id": job_id,
"error": str(e),
}
finally:
# Clean up temp files
for f in [tmp_source, tmp_output]:
try:
os.unlink(f)
except OSError:
pass

ctrl/Dockerfile
View File

@@ -5,6 +5,7 @@ WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

-COPY . .
+# No COPY . . — code is volume-mounted in dev (..:/app)
+# This image only provides the Python runtime + dependencies

CMD ["python", "admin/manage.py", "runserver", "0.0.0.0:8000"]

View File

@@ -9,6 +9,7 @@ WORKDIR /app
COPY requirements.txt requirements-worker.txt ./
RUN pip install --no-cache-dir -r requirements-worker.txt

-COPY . .
+# No COPY . . — code is volume-mounted in dev (..:/app)
+# This image only provides Python runtime + FFmpeg + dependencies

CMD ["celery", "-A", "admin.mpr", "worker", "--loglevel=info"]

ctrl/docker-compose.yml
View File

@@ -17,6 +17,20 @@ x-healthcheck-defaults: &healthcheck-defaults
    timeout: 5s
    retries: 5

+x-python-service: &python-service
+  build:
+    context: ..
+    dockerfile: ctrl/Dockerfile
+  volumes:
+    - ..:/app
+  environment:
+    <<: *common-env
+  depends_on:
+    postgres:
+      condition: service_healthy
+    redis:
+      condition: service_healthy
+
services:
  # =============================================================================
  # Infrastructure
@@ -92,47 +106,25 @@ services:
  # =============================================================================

  django:
-    build:
-      context: ..
-      dockerfile: ctrl/Dockerfile
+    <<: *python-service
    command: >
      bash -c "python admin/manage.py migrate &&
               python admin/manage.py loadbuiltins || true &&
               python admin/manage.py runserver 0.0.0.0:8701"
    ports:
      - "8701:8701"
-    environment:
-      <<: *common-env
-    volumes:
-      - ..:/app
-    depends_on:
-      postgres:
-        condition: service_healthy
-      redis:
-        condition: service_healthy

  fastapi:
-    build:
-      context: ..
-      dockerfile: ctrl/Dockerfile
+    <<: *python-service
    command: uvicorn core.api.main:app --host 0.0.0.0 --port 8702 --reload
    ports:
      - "8702:8702"
    environment:
      <<: *common-env
      DJANGO_ALLOW_ASYNC_UNSAFE: "true"
-    volumes:
-      - ..:/app
-    depends_on:
-      postgres:
-        condition: service_healthy
-      redis:
-        condition: service_healthy

  grpc:
-    build:
-      context: ..
-      dockerfile: ctrl/Dockerfile
+    <<: *python-service
    command: python -m core.rpc.server
    ports:
      - "50052:50051"
@@ -140,13 +132,6 @@ services:
      <<: *common-env
      GRPC_PORT: 50051
      GRPC_MAX_WORKERS: 10
-    volumes:
-      - ..:/app
-    depends_on:
-      postgres:
-        condition: service_healthy
-      redis:
-        condition: service_healthy

  celery:
    build:
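
The x-python-service block leans on YAML merge keys: << pastes the anchored mapping into each service at parse time, and keys the service sets explicitly override merged ones. A quick sanity check of the semantics with PyYAML (assumed installed), which implements the same YAML 1.1 merge behavior Compose supports:

import yaml

doc = """
x-base: &base
  volumes:
    - ..:/app
  environment:
    TZ: UTC

services:
  django:
    <<: *base
    command: runserver
"""

cfg = yaml.safe_load(doc)
# The loader resolves the merge, so django carries its own copy of the shared keys:
print(cfg["services"]["django"])
# {'volumes': ['..:/app'], 'environment': {'TZ': 'UTC'}, 'command': 'runserver'}  (key order may vary)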

View File

@@ -14,8 +14,8 @@ COPY ctrl/lambda/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
-COPY core/task/lambda_handler.py ${LAMBDA_TASK_ROOT}/core/task/lambda_handler.py
-COPY core/task/__init__.py ${LAMBDA_TASK_ROOT}/core/task/__init__.py
+COPY core/jobs/lambda_handler.py ${LAMBDA_TASK_ROOT}/core/jobs/lambda_handler.py
+COPY core/jobs/__init__.py ${LAMBDA_TASK_ROOT}/core/jobs/__init__.py
COPY core/ ${LAMBDA_TASK_ROOT}/core/

-CMD ["core.task.lambda_handler.handler"]
+CMD ["core.jobs.lambda_handler.handler"]

0
tests/__init__.py Normal file
View File

tests/chunker/__init__.py Normal file
View File

76
tests/chunker/conftest.py Normal file
View File

@@ -0,0 +1,76 @@
"""
Shared fixtures for chunker tests.
Demonstrates: TDD and unit testing best practices (Interview Topic 8) — fixtures, temp files.
"""
import os
import tempfile
import pytest
from core.chunker.models import Chunk, ChunkResult
@pytest.fixture
def temp_file():
"""Create a temporary file with known content, cleaned up after test."""
files = []
def _create(content: bytes = b"x" * 4096):
f = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
f.write(content)
f.close()
files.append(f.name)
return f.name
yield _create
for path in files:
if os.path.exists(path):
os.unlink(path)
@pytest.fixture
def sample_chunk(temp_file):
"""Create a sample time-based Chunk with valid time range."""
path = temp_file(b"x" * 1024)
return Chunk(
sequence=0,
start_time=0.0,
end_time=10.0,
source_path=path,
duration=10.0,
)
@pytest.fixture
def make_chunk(temp_file):
"""Factory fixture for creating time-based chunks with specific sequence numbers."""
path = temp_file(b"x" * 1024)
def _make(sequence: int, duration: float = 10.0) -> Chunk:
start = sequence * duration
return Chunk(
sequence=sequence,
start_time=start,
end_time=start + duration,
source_path=path,
duration=duration,
)
return _make
@pytest.fixture
def make_result():
"""Factory fixture for creating ChunkResults."""
def _make(sequence: int, success: bool = True, processing_time: float = 0.01) -> ChunkResult:
return ChunkResult(
sequence=sequence,
success=success,
processing_time=processing_time,
)
return _make

tests/chunker/test_chunker.py Normal file
View File

@@ -0,0 +1,149 @@
"""
Tests for Chunker — time-based segmentation, chunk counts, sequence numbers, generator behavior.
Demonstrates: TDD (Interview Topic 8) — parametrized tests, edge cases, mocking.
"""
from unittest.mock import patch, MagicMock
import pytest
from core.chunker import Chunker
from core.chunker.exceptions import ChunkReadError
def mock_probe(duration):
"""Create a mock probe_file that returns the given duration."""
result = MagicMock()
result.duration = duration
return result
class TestChunker:
@patch("core.chunker.chunker.probe_file")
def test_basic_chunking(self, mock_pf, temp_file):
"""File splits into expected number of time-based chunks."""
path = temp_file(b"x" * 1000)
mock_pf.return_value = mock_probe(30.0)
chunker = Chunker(path, chunk_duration=10.0)
chunks = list(chunker.chunks())
assert len(chunks) == 3
assert chunks[0].start_time == 0.0
assert chunks[0].end_time == 10.0
assert chunks[0].duration == 10.0
assert chunks[1].start_time == 10.0
assert chunks[2].start_time == 20.0
@patch("core.chunker.chunker.probe_file")
def test_sequence_numbers(self, mock_pf, temp_file):
"""Chunks have sequential sequence numbers starting at 0."""
path = temp_file(b"x" * 100)
mock_pf.return_value = mock_probe(40.0)
chunker = Chunker(path, chunk_duration=10.0)
chunks = list(chunker.chunks())
sequences = [c.sequence for c in chunks]
assert sequences == [0, 1, 2, 3]
@patch("core.chunker.chunker.probe_file")
def test_time_ranges(self, mock_pf, temp_file):
"""Each chunk has correct start_time and end_time."""
path = temp_file(b"x" * 100)
mock_pf.return_value = mock_probe(25.0)
chunker = Chunker(path, chunk_duration=10.0)
chunks = list(chunker.chunks())
assert chunks[0].start_time == 0.0
assert chunks[0].end_time == 10.0
assert chunks[1].start_time == 10.0
assert chunks[1].end_time == 20.0
assert chunks[2].start_time == 20.0
assert chunks[2].end_time == 25.0 # last chunk shorter
assert chunks[2].duration == 5.0
@patch("core.chunker.chunker.probe_file")
def test_expected_chunks_property(self, mock_pf, temp_file):
"""expected_chunks calculates correctly before iteration."""
path = temp_file(b"x" * 100)
mock_pf.return_value = mock_probe(25.0)
chunker = Chunker(path, chunk_duration=10.0)
assert chunker.expected_chunks == 3 # ceil(25/10)
@patch("core.chunker.chunker.probe_file")
def test_source_path_on_chunks(self, mock_pf, temp_file):
"""Each chunk carries the source file path."""
path = temp_file(b"x" * 100)
mock_pf.return_value = mock_probe(10.0)
chunker = Chunker(path, chunk_duration=10.0)
chunks = list(chunker.chunks())
assert all(c.source_path == path for c in chunks)
def test_file_not_found(self):
"""Non-existent file raises ChunkReadError."""
with pytest.raises(ChunkReadError, match="File not found"):
Chunker("/nonexistent/file.mp4")
@patch("core.chunker.chunker.probe_file")
def test_invalid_chunk_duration(self, mock_pf, temp_file):
"""Zero or negative chunk_duration raises ValueError."""
path = temp_file(b"x" * 100)
with pytest.raises(ValueError, match="chunk_duration must be positive"):
Chunker(path, chunk_duration=0)
with pytest.raises(ValueError, match="chunk_duration must be positive"):
Chunker(path, chunk_duration=-1)
@patch("core.chunker.chunker.probe_file")
def test_generator_laziness(self, mock_pf, temp_file):
"""Chunks are yielded lazily, not pre-loaded."""
path = temp_file(b"x" * 100)
mock_pf.return_value = mock_probe(30.0)
chunker = Chunker(path, chunk_duration=10.0)
gen = chunker.chunks()
first = next(gen)
assert first.sequence == 0
# Generator is not exhausted — remaining chunks still pending
@pytest.mark.parametrize("duration,chunk_dur,expected", [
(10.0, 10.0, 1),
(10.1, 10.0, 2),
(1.0, 1.0, 1),
(100.0, 1.0, 100),
(5.0, 100.0, 1),
])
@patch("core.chunker.chunker.probe_file")
def test_expected_chunks_parametrized(self, mock_pf, temp_file, duration, chunk_dur, expected):
"""Parametrized: various duration/chunk_duration combos."""
path = temp_file(b"x" * 100)
mock_pf.return_value = mock_probe(duration)
chunker = Chunker(path, chunk_duration=chunk_dur)
assert chunker.expected_chunks == expected
@patch("core.chunker.chunker.probe_file")
def test_exact_multiple(self, mock_pf, temp_file):
"""Duration exactly divisible by chunk_duration."""
path = temp_file(b"x" * 100)
mock_pf.return_value = mock_probe(30.0)
chunker = Chunker(path, chunk_duration=10.0)
chunks = list(chunker.chunks())
assert len(chunks) == 3
assert all(c.duration == 10.0 for c in chunks)
@patch("core.chunker.chunker.probe_file")
def test_probe_failure(self, mock_pf, temp_file):
"""Probe failure raises ChunkReadError."""
path = temp_file(b"x" * 100)
mock_pf.side_effect = Exception("ffprobe failed")
with pytest.raises(ChunkReadError, match="Failed to probe"):
Chunker(path, chunk_duration=10.0)
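
These tests pin down Chunker's contract without showing the implementation. A sketch consistent with them follows: validation before probing, ceil-based expected_chunks, a lazy generator, and a shorter final chunk. The probe_file import path is an assumption (the tests patch core.chunker.chunker.probe_file), and the real core/chunker/chunker.py may differ:

import math
import os
from typing import Iterator

from core.chunker.exceptions import ChunkReadError
from core.chunker.ffmpeg import probe_file  # assumed module for probe_file
from core.chunker.models import Chunk


class Chunker:
    """Sketch: time-based segmentation of a media file."""

    def __init__(self, source_path: str, chunk_duration: float = 10.0):
        if chunk_duration <= 0:
            raise ValueError("chunk_duration must be positive")
        if not os.path.exists(source_path):
            raise ChunkReadError(f"File not found: {source_path}")
        self.source_path = source_path
        self.chunk_duration = chunk_duration
        try:
            self.duration = probe_file(source_path).duration
        except Exception as exc:
            raise ChunkReadError(f"Failed to probe {source_path}: {exc}") from exc

    @property
    def expected_chunks(self) -> int:
        return math.ceil(self.duration / self.chunk_duration)

    def chunks(self) -> Iterator[Chunk]:
        # Lazy: chunks are produced on demand; the last one may be shorter.
        for seq in range(self.expected_chunks):
            start = seq * self.chunk_duration
            end = min(start + self.chunk_duration, self.duration)
            yield Chunk(sequence=seq, start_time=start, end_time=end,
                        source_path=self.source_path, duration=end - start)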

tests/chunker/test_collector.py Normal file
View File

@@ -0,0 +1,103 @@
"""
Tests for ResultCollector — ordered reassembly, out-of-order buffering, duplicates.
Demonstrates: TDD (Interview Topic 8) — testing algorithms (heapq reassembly).
"""
import pytest
from core.chunker.collector import ResultCollector
from core.chunker.exceptions import ReassemblyError
class TestResultCollector:
def test_in_order_emission(self, make_result):
"""Results arriving in order are emitted immediately."""
collector = ResultCollector(total_chunks=3)
emitted = collector.add(make_result(0))
assert len(emitted) == 1
assert emitted[0].sequence == 0
emitted = collector.add(make_result(1))
assert len(emitted) == 1
emitted = collector.add(make_result(2))
assert len(emitted) == 1
assert collector.is_complete
def test_out_of_order_buffering(self, make_result):
"""Out-of-order results are buffered until gaps fill."""
collector = ResultCollector(total_chunks=3)
# Arrive: 2, 0, 1
emitted = collector.add(make_result(2))
assert len(emitted) == 0
assert collector.buffered_count == 1
emitted = collector.add(make_result(0))
assert len(emitted) == 1 # Only 0 emitted, 1 still missing
emitted = collector.add(make_result(1))
assert len(emitted) == 2 # 1 and 2 now emittable
assert collector.is_complete
def test_reverse_order(self, make_result):
"""All results arrive in reverse — only last add emits everything."""
collector = ResultCollector(total_chunks=4)
for seq in [3, 2, 1]:
emitted = collector.add(make_result(seq))
assert len(emitted) == 0
emitted = collector.add(make_result(0))
assert len(emitted) == 4
assert collector.is_complete
def test_duplicate_raises(self, make_result):
"""Duplicate sequence number raises ReassemblyError."""
collector = ResultCollector(total_chunks=3)
collector.add(make_result(0))
with pytest.raises(ReassemblyError, match="Duplicate"):
collector.add(make_result(0))
def test_emitted_count(self, make_result):
"""emitted_count tracks correctly."""
collector = ResultCollector(total_chunks=3)
assert collector.emitted_count == 0
collector.add(make_result(0))
assert collector.emitted_count == 1
collector.add(make_result(2)) # buffered
assert collector.emitted_count == 1
collector.add(make_result(1)) # releases 1 and 2
assert collector.emitted_count == 3
def test_get_ordered_results(self, make_result):
"""get_ordered_results returns all emitted results in order."""
collector = ResultCollector(total_chunks=3)
collector.add(make_result(2))
collector.add(make_result(0))
collector.add(make_result(1))
ordered = collector.get_ordered_results()
assert [r.sequence for r in ordered] == [0, 1, 2]
def test_avg_processing_time(self, make_result):
"""Average processing time from sliding window."""
collector = ResultCollector(total_chunks=2)
collector.add(make_result(0, processing_time=0.1))
collector.add(make_result(1, processing_time=0.3))
assert abs(collector.avg_processing_time - 0.2) < 0.001
def test_not_complete_when_partial(self, make_result):
"""is_complete is False until all chunks emitted."""
collector = ResultCollector(total_chunks=3)
collector.add(make_result(0))
collector.add(make_result(1))
assert not collector.is_complete
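
A minimal ResultCollector sketch that satisfies these tests, built on a min-heap keyed by sequence as the module docstring suggests; the real collector likely tracks more:

import heapq
from collections import deque
from typing import List

from core.chunker.exceptions import ReassemblyError
from core.chunker.models import ChunkResult


class ResultCollector:
    """Sketch: emit results in sequence order, buffering gaps on a heap."""

    def __init__(self, total_chunks: int, window: int = 50):
        self.total_chunks = total_chunks
        self._heap: list = []                # min-heap of (sequence, result)
        self._seen: set = set()
        self._next = 0                       # next sequence eligible for emission
        self._ordered: List[ChunkResult] = []
        self._times = deque(maxlen=window)   # sliding window for avg_processing_time

    def add(self, result: ChunkResult) -> List[ChunkResult]:
        if result.sequence in self._seen:
            raise ReassemblyError(f"Duplicate sequence: {result.sequence}")
        self._seen.add(result.sequence)
        self._times.append(result.processing_time)
        heapq.heappush(self._heap, (result.sequence, result))
        emitted: List[ChunkResult] = []
        # Drain while the heap's smallest sequence is the next contiguous one.
        while self._heap and self._heap[0][0] == self._next:
            emitted.append(heapq.heappop(self._heap)[1])
            self._next += 1
        self._ordered.extend(emitted)
        return emitted

    @property
    def buffered_count(self) -> int:
        return len(self._heap)

    @property
    def emitted_count(self) -> int:
        return len(self._ordered)

    @property
    def is_complete(self) -> bool:
        return self._next >= self.total_chunks

    @property
    def avg_processing_time(self) -> float:
        return sum(self._times) / len(self._times) if self._times else 0.0

    def get_ordered_results(self) -> List[ChunkResult]:
        return list(self._ordered)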

tests/chunker/test_exceptions.py Normal file
View File

@@ -0,0 +1,69 @@
"""
Tests for exception hierarchy — catch patterns, attributes.
Demonstrates: TDD (Interview Topic 8) — testing exception design.
"""
import pytest
from core.chunker.exceptions import (
ChunkChecksumError,
ChunkError,
ChunkReadError,
PipelineError,
ProcessingError,
ProcessorFailureError,
ProcessorTimeoutError,
ReassemblyError,
)
class TestExceptionHierarchy:
"""Verify the exception class hierarchy and catch patterns."""
def test_pipeline_error_is_base(self):
"""All chunker exceptions inherit from PipelineError."""
assert issubclass(ChunkError, PipelineError)
assert issubclass(ProcessingError, PipelineError)
assert issubclass(ReassemblyError, PipelineError)
def test_chunk_error_subtypes(self):
"""ChunkReadError and ChunkChecksumError are ChunkErrors."""
assert issubclass(ChunkReadError, ChunkError)
assert issubclass(ChunkChecksumError, ChunkError)
def test_processing_error_subtypes(self):
"""ProcessorTimeoutError and ProcessorFailureError are ProcessingErrors."""
assert issubclass(ProcessorTimeoutError, ProcessingError)
assert issubclass(ProcessorFailureError, ProcessingError)
def test_catch_pipeline_error_catches_all(self):
"""Catching PipelineError catches any subtype."""
with pytest.raises(PipelineError):
raise ChunkReadError("test")
with pytest.raises(PipelineError):
raise ReassemblyError("test")
def test_checksum_error_attributes(self):
"""ChunkChecksumError carries sequence, expected, actual."""
err = ChunkChecksumError(sequence=5, expected="aaa", actual="bbb")
assert err.sequence == 5
assert err.expected == "aaa"
assert err.actual == "bbb"
assert "5" in str(err)
def test_timeout_error_attributes(self):
"""ProcessorTimeoutError carries sequence and timeout."""
err = ProcessorTimeoutError(sequence=3, timeout=30.0)
assert err.sequence == 3
assert err.timeout == 30.0
def test_failure_error_attributes(self):
"""ProcessorFailureError carries sequence, retries, original error."""
original = RuntimeError("boom")
err = ProcessorFailureError(sequence=1, retries=3, original_error=original)
assert err.sequence == 1
assert err.retries == 3
assert err.original_error is original
assert "boom" in str(err)

tests/chunker/test_pipeline.py Normal file
View File

@@ -0,0 +1,144 @@
"""
Tests for Pipeline — end-to-end orchestration, stats, error handling.
Demonstrates: TDD (Interview Topic 8) — integration testing with mocked FFmpeg probe.
"""
from unittest.mock import MagicMock, patch
import pytest
from core.chunker import Pipeline
from core.chunker.exceptions import PipelineError
def mock_probe(duration):
"""Create a mock ProbeResult with the given duration."""
result = MagicMock()
result.duration = duration
return result
class TestPipeline:
@patch("core.chunker.chunker.probe_file")
def test_end_to_end(self, mock_pf, temp_file):
"""Full pipeline processes a file successfully."""
path = temp_file(b"x" * 4096)
mock_pf.return_value = mock_probe(40.0)
result = Pipeline(
source=path,
chunk_duration=10.0,
num_workers=2,
processor_type="checksum",
).run()
assert result.total_chunks == 4
assert result.processed == 4
assert result.failed == 0
assert result.elapsed_time > 0
assert result.chunks_in_order is True
@patch("core.chunker.chunker.probe_file")
def test_throughput_calculated(self, mock_pf, temp_file):
"""Pipeline calculates throughput."""
path = temp_file(b"x" * 10000)
mock_pf.return_value = mock_probe(30.0)
result = Pipeline(source=path, chunk_duration=10.0, num_workers=2).run()
assert result.throughput_mbps > 0
@patch("core.chunker.chunker.probe_file")
def test_worker_stats(self, mock_pf, temp_file):
"""Pipeline reports per-worker stats."""
path = temp_file(b"x" * 4000)
mock_pf.return_value = mock_probe(40.0)
result = Pipeline(
source=path, chunk_duration=10.0, num_workers=2
).run()
assert len(result.worker_stats) == 2
for worker_id, stats in result.worker_stats.items():
assert "processed" in stats
assert "errors" in stats
def test_nonexistent_file(self):
"""Non-existent file raises PipelineError."""
with pytest.raises(PipelineError):
Pipeline(source="/nonexistent/file.mp4").run()
@patch("core.chunker.chunker.probe_file")
def test_event_callback(self, mock_pf, temp_file):
"""Pipeline emits events through callback."""
path = temp_file(b"x" * 2048)
mock_pf.return_value = mock_probe(20.0)
events = []
def capture(event_type, data):
events.append(event_type)
Pipeline(
source=path,
chunk_duration=10.0,
num_workers=1,
event_callback=capture,
).run()
assert "pipeline_start" in events
assert "pipeline_complete" in events
assert "chunk_queued" in events
@patch("core.chunker.chunker.probe_file")
def test_simulated_decode_processor(self, mock_pf, temp_file):
"""Pipeline works with simulated_decode processor."""
path = temp_file(b"x" * 2048)
mock_pf.return_value = mock_probe(20.0)
result = Pipeline(
source=path,
chunk_duration=10.0,
num_workers=2,
processor_type="simulated_decode",
).run()
assert result.total_chunks == 2
assert result.failed == 0
@patch("core.chunker.chunker.probe_file")
def test_single_chunk_file(self, mock_pf, temp_file):
"""Duration shorter than chunk_duration produces one chunk."""
path = temp_file(b"x" * 100)
mock_pf.return_value = mock_probe(5.0)
result = Pipeline(source=path, chunk_duration=10.0).run()
assert result.total_chunks == 1
assert result.processed == 1
@patch("core.chunker.chunker.probe_file")
def test_retries_tracked(self, mock_pf, temp_file):
"""Pipeline result tracks total retries."""
path = temp_file(b"x" * 2048)
mock_pf.return_value = mock_probe(20.0)
result = Pipeline(source=path, chunk_duration=10.0).run()
assert result.retries >= 0 # Might be 0 if no failures
@patch("core.chunker.chunker.probe_file")
def test_output_dir_and_chunk_files(self, mock_pf, temp_file):
"""Pipeline tracks output_dir and chunk_files when set."""
path = temp_file(b"x" * 1024)
mock_pf.return_value = mock_probe(10.0)
result = Pipeline(
source=path,
chunk_duration=10.0,
processor_type="checksum",
).run()
# No output_dir set, so chunk_files should be empty
assert result.output_dir is None
assert result.chunk_files == []
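
Condensed, the orchestration these tests exercise looks roughly like this sketch: a producer thread feeds a bounded ChunkQueue, workers drain it, and results are reordered on collection. The internal wiring is an assumption; the real Pipeline also emits events and computes worker stats:

import threading
import time

from core.chunker import Chunker
from core.chunker.collector import ResultCollector
from core.chunker.processor import ChecksumProcessor
from core.chunker.queue import ChunkQueue
from core.chunker.worker import Worker


def run_pipeline(source: str, chunk_duration: float = 10.0, num_workers: int = 4):
    """Sketch: chunk -> bounded queue -> worker pool -> ordered collection."""
    chunker = Chunker(source, chunk_duration=chunk_duration)
    q = ChunkQueue(maxsize=10)
    collector = ResultCollector(total_chunks=chunker.expected_chunks)
    lock = threading.Lock()

    def produce() -> None:
        for chunk in chunker.chunks():  # lazy; put() blocks when full (backpressure)
            q.put(chunk)
        q.close()                       # sentinel fans out to every worker

    def consume(worker: Worker) -> None:
        for result in worker.run():     # run() returns this worker's results
            with lock:
                collector.add(result)   # heap reorders out-of-order arrivals

    workers = [Worker(f"w-{i}", q, ChecksumProcessor()) for i in range(num_workers)]
    threads = [threading.Thread(target=produce)]
    threads += [threading.Thread(target=consume, args=(w,)) for w in workers]
    start = time.monotonic()
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return collector, time.monotonic() - start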

tests/chunker/test_processor.py Normal file
View File

@@ -0,0 +1,98 @@
"""
Tests for Processor implementations — ChecksumProcessor, SimulatedDecodeProcessor, CompositeProcessor.
Demonstrates: TDD (Interview Topic 8) — ABC contract, parametrized tests.
"""
import pytest
from core.chunker.exceptions import ChunkChecksumError
from core.chunker.models import Chunk
from core.chunker.processor import (
ChecksumProcessor,
CompositeProcessor,
Processor,
SimulatedDecodeProcessor,
)
class TestChecksumProcessor:
def test_valid_time_range(self, sample_chunk):
"""Valid time range passes."""
proc = ChecksumProcessor()
result = proc.process(sample_chunk)
assert result.success is True
assert result.checksum_valid is True
assert result.processing_time > 0
def test_invalid_time_range(self):
"""Invalid time range raises ChunkChecksumError."""
chunk = Chunk(
sequence=0,
start_time=10.0,
end_time=10.0, # zero duration
source_path="/fake.mp4",
duration=0.0,
)
proc = ChecksumProcessor()
with pytest.raises(ChunkChecksumError) as exc_info:
proc.process(chunk)
assert exc_info.value.sequence == 0
def test_sequence_preserved(self, make_chunk):
"""Result carries the chunk's sequence number."""
chunk = make_chunk(42)
proc = ChecksumProcessor()
result = proc.process(chunk)
assert result.sequence == 42
class TestSimulatedDecodeProcessor:
def test_processes_successfully(self, sample_chunk):
"""Simulated decode always succeeds."""
proc = SimulatedDecodeProcessor(ms_per_second=1.0)
result = proc.process(sample_chunk)
assert result.success is True
assert result.processing_time > 0
def test_time_proportional_to_duration(self):
"""Longer chunks take longer."""
short = Chunk(0, 0.0, 1.0, "/fake.mp4", 1.0)
long = Chunk(1, 0.0, 10.0, "/fake.mp4", 10.0)
proc = SimulatedDecodeProcessor(ms_per_second=50.0)
r_short = proc.process(short)
r_long = proc.process(long)
assert r_long.processing_time > r_short.processing_time
class TestCompositeProcessor:
def test_chains_processors(self, sample_chunk):
"""Composite runs all processors in sequence."""
proc = CompositeProcessor([
ChecksumProcessor(),
SimulatedDecodeProcessor(ms_per_second=1.0),
])
result = proc.process(sample_chunk)
assert result.success is True
def test_stops_on_failure(self):
"""If first processor raises, composite propagates the error."""
bad_chunk = Chunk(0, 10.0, 10.0, "/fake.mp4", 0.0) # invalid range
proc = CompositeProcessor([
ChecksumProcessor(),
SimulatedDecodeProcessor(ms_per_second=1.0),
])
with pytest.raises(ChunkChecksumError):
proc.process(bad_chunk)
def test_requires_at_least_one(self):
"""Empty processor list raises ValueError."""
with pytest.raises(ValueError, match="at least one"):
CompositeProcessor([])
def test_is_processor(self):
"""CompositeProcessor is a Processor."""
proc = CompositeProcessor([ChecksumProcessor()])
assert isinstance(proc, Processor)
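
The chaining behavior these tests rely on is small enough to sketch: run children in order, fail fast on the first exception, return the last stage's result. The real CompositeProcessor may combine results differently:

from typing import List, Optional

from core.chunker.models import Chunk, ChunkResult
from core.chunker.processor import Processor


class CompositeProcessor(Processor):
    """Sketch of the composite contract: ordered chain, fail-fast."""

    def __init__(self, processors: List[Processor]):
        if not processors:
            raise ValueError("CompositeProcessor requires at least one processor")
        self.processors = processors

    def process(self, chunk: Chunk) -> ChunkResult:
        result: Optional[ChunkResult] = None
        for proc in self.processors:
            result = proc.process(chunk)  # any stage may raise; the error propagates
        return result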

115
tests/chunker/test_queue.py Normal file
View File

@@ -0,0 +1,115 @@
"""
Tests for ChunkQueue — backpressure, sentinel shutdown, timeout behavior.
Demonstrates: TDD (Interview Topic 8) — concurrency testing.
"""
import queue
import threading
import pytest
from core.chunker.queue import ChunkQueue
class TestChunkQueue:
def test_put_and_get(self, make_chunk):
"""Basic put/get cycle."""
q = ChunkQueue(maxsize=5)
chunk = make_chunk(0)
q.put(chunk)
result = q.get(timeout=1.0)
assert result.sequence == 0
def test_fifo_order(self, make_chunk):
"""Items come out in FIFO order."""
q = ChunkQueue(maxsize=5)
for i in range(3):
q.put(make_chunk(i))
for i in range(3):
assert q.get(timeout=1.0).sequence == i
def test_close_returns_none(self, make_chunk):
"""After close(), get() returns None (sentinel)."""
q = ChunkQueue(maxsize=5)
q.put(make_chunk(0))
q.close()
result = q.get(timeout=1.0)
assert result.sequence == 0
# Next get should hit sentinel
result = q.get(timeout=1.0)
assert result is None
def test_close_propagates_to_multiple_consumers(self, make_chunk):
"""Sentinel propagates: multiple consumers all get None."""
q = ChunkQueue(maxsize=5)
q.close()
# Multiple consumers should all see None
assert q.get(timeout=1.0) is None
assert q.get(timeout=1.0) is None
def test_is_closed(self):
"""is_closed reflects state."""
q = ChunkQueue()
assert not q.is_closed
q.close()
assert q.is_closed
def test_qsize(self, make_chunk):
"""qsize tracks approximate queue depth."""
q = ChunkQueue(maxsize=10)
assert q.qsize() == 0
q.put(make_chunk(0))
q.put(make_chunk(1))
assert q.qsize() == 2
q.get(timeout=1.0)
assert q.qsize() == 1
def test_backpressure_blocks(self, make_chunk):
"""Put blocks when queue is full (backpressure)."""
q = ChunkQueue(maxsize=2)
q.put(make_chunk(0))
q.put(make_chunk(1))
# Queue is full — put with short timeout should raise
with pytest.raises(queue.Full):
q.put(make_chunk(2), timeout=0.05)
def test_get_timeout(self):
"""Get on empty queue with timeout raises Empty."""
q = ChunkQueue(maxsize=5)
with pytest.raises(queue.Empty):
q.get(timeout=0.05)
def test_concurrent_put_get(self, make_chunk):
"""Producer/consumer threads work correctly."""
q = ChunkQueue(maxsize=3)
results = []
def producer():
for i in range(10):
q.put(make_chunk(i))
q.close()
def consumer():
while True:
item = q.get(timeout=2.0)
if item is None:
break
results.append(item.sequence)
t1 = threading.Thread(target=producer)
t2 = threading.Thread(target=consumer)
t1.start()
t2.start()
t1.join(timeout=5.0)
t2.join(timeout=5.0)
assert sorted(results) == list(range(10))
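
The sentinel mechanics under test deserve spelling out: close() enqueues one marker, and every consumer that pops it re-enqueues it before returning None, so a single close reaches all workers. A sketch that matches these tests; the real ChunkQueue may add locking or metrics:

import queue
from typing import Optional

from core.chunker.models import Chunk

_SENTINEL = None  # close() marker; get() surfaces it to callers as None


class ChunkQueue:
    """Sketch: bounded FIFO whose close() sentinel fans out to every consumer."""

    def __init__(self, maxsize: int = 0):
        self._q: queue.Queue = queue.Queue(maxsize=maxsize)
        self._closed = False

    def put(self, chunk: Chunk, timeout: Optional[float] = None) -> None:
        # Blocks (or raises queue.Full on timeout) when full: backpressure.
        self._q.put(chunk, block=True, timeout=timeout)

    def get(self, timeout: Optional[float] = None):
        item = self._q.get(block=True, timeout=timeout)  # raises queue.Empty on timeout
        if item is _SENTINEL:
            self._q.put(_SENTINEL)  # re-queue so the next consumer also sees it
            return None
        return item

    def close(self) -> None:
        # Note: blocks if the queue is at maxsize; acceptable for a sketch.
        self._closed = True
        self._q.put(_SENTINEL)

    @property
    def is_closed(self) -> bool:
        return self._closed

    def qsize(self) -> int:
        return self._q.qsize()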

tests/chunker/test_worker.py Normal file
View File

@@ -0,0 +1,127 @@
"""
Tests for Worker — processing, retry with backoff, error handling.
Demonstrates: TDD (Interview Topic 8) — mocking processors, testing retry logic.
"""
from unittest.mock import MagicMock
import pytest
from core.chunker.models import Chunk, ChunkResult
from core.chunker.processor import Processor
from core.chunker.queue import ChunkQueue
from core.chunker.worker import Worker
class FailNTimesProcessor(Processor):
"""Test processor that fails N times then succeeds."""
def __init__(self, fail_count: int):
self.fail_count = fail_count
self.call_count = 0
def process(self, chunk: Chunk) -> ChunkResult:
self.call_count += 1
if self.call_count <= self.fail_count:
raise RuntimeError(f"Simulated failure #{self.call_count}")
return ChunkResult(
sequence=chunk.sequence,
success=True,
processing_time=0.001,
)
class AlwaysFailProcessor(Processor):
"""Test processor that always fails."""
def process(self, chunk: Chunk) -> ChunkResult:
raise RuntimeError("Always fails")
class TestWorker:
def test_processes_chunks(self, make_chunk):
"""Worker processes all chunks from queue."""
q = ChunkQueue(maxsize=5)
for i in range(3):
q.put(make_chunk(i))
q.close()
from core.chunker.processor import ChecksumProcessor
worker = Worker("w-0", q, ChecksumProcessor(), max_retries=0)
results = worker.run()
assert len(results) == 3
assert all(r.success for r in results)
def test_retry_on_failure(self, make_chunk):
"""Worker retries on processor failure."""
q = ChunkQueue(maxsize=5)
q.put(make_chunk(0))
q.close()
proc = FailNTimesProcessor(fail_count=2)
worker = Worker("w-0", q, proc, max_retries=3)
results = worker.run()
assert len(results) == 1
assert results[0].success is True
assert results[0].retries == 2
assert proc.call_count == 3 # 2 failures + 1 success
def test_max_retries_exceeded(self, make_chunk):
"""Worker gives up after max retries."""
q = ChunkQueue(maxsize=5)
q.put(make_chunk(0))
q.close()
worker = Worker("w-0", q, AlwaysFailProcessor(), max_retries=2)
results = worker.run()
assert len(results) == 1
assert results[0].success is False
assert results[0].error is not None
assert worker.error_count == 1
def test_worker_id_on_results(self, make_chunk):
"""Worker stamps its ID on results."""
q = ChunkQueue(maxsize=5)
q.put(make_chunk(0))
q.close()
from core.chunker.processor import ChecksumProcessor
worker = Worker("worker-7", q, ChecksumProcessor())
results = worker.run()
assert results[0].worker_id == "worker-7"
def test_event_callback(self, make_chunk):
"""Worker emits events via callback."""
q = ChunkQueue(maxsize=5)
q.put(make_chunk(0))
q.close()
events = []
callback = MagicMock(side_effect=lambda t, d: events.append((t, d)))
from core.chunker.processor import ChecksumProcessor
worker = Worker("w-0", q, ChecksumProcessor(), event_callback=callback)
worker.run()
event_types = [e[0] for e in events]
assert "worker_status" in event_types
assert "chunk_processing" in event_types
assert "chunk_done" in event_types
def test_processed_count(self, make_chunk):
"""Worker tracks processed count."""
q = ChunkQueue(maxsize=10)
for i in range(5):
q.put(make_chunk(i))
q.close()
from core.chunker.processor import ChecksumProcessor
worker = Worker("w-0", q, ChecksumProcessor())
worker.run()
assert worker.processed_count == 5
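
Finally, the retry loop these tests fix in place, sketched below: bounded attempts with exponential backoff, and a failure result only once the budget is spent. It assumes ChunkResult carries worker_id, retries, and error fields, per the assertions above:

import time
from typing import Callable, List, Optional

from core.chunker.models import Chunk, ChunkResult
from core.chunker.processor import Processor
from core.chunker.queue import ChunkQueue


class Worker:
    """Sketch: drain the queue until the sentinel, retrying with backoff."""

    def __init__(self, worker_id: str, chunk_queue: ChunkQueue, processor: Processor,
                 max_retries: int = 3, event_callback: Optional[Callable] = None,
                 base_delay: float = 0.01):
        self.worker_id = worker_id
        self.chunk_queue = chunk_queue
        self.processor = processor
        self.max_retries = max_retries
        self.event_callback = event_callback or (lambda _t, _d: None)
        self.base_delay = base_delay
        self.processed_count = 0
        self.error_count = 0

    def run(self) -> List[ChunkResult]:
        results: List[ChunkResult] = []
        self.event_callback("worker_status", {"worker_id": self.worker_id, "state": "idle"})
        while True:
            chunk = self.chunk_queue.get(timeout=5.0)
            if chunk is None:  # sentinel: queue closed and drained
                break
            self.event_callback("chunk_processing",
                                {"worker_id": self.worker_id, "sequence": chunk.sequence})
            results.append(self._attempt(chunk))
        self.event_callback("worker_status", {"worker_id": self.worker_id, "state": "stopped"})
        return results

    def _attempt(self, chunk: Chunk) -> ChunkResult:
        for attempt in range(self.max_retries + 1):
            try:
                result = self.processor.process(chunk)
                result.worker_id = self.worker_id  # assumes mutable result fields
                result.retries = attempt
                self.processed_count += 1
                self.event_callback("chunk_done", {"worker_id": self.worker_id,
                                                   "sequence": chunk.sequence, "success": True})
                return result
            except Exception as exc:
                if attempt == self.max_retries:
                    self.error_count += 1
                    self.event_callback("chunk_done", {"worker_id": self.worker_id,
                                                       "sequence": chunk.sequence,
                                                       "success": False, "error": str(exc)})
                    return ChunkResult(sequence=chunk.sequence, success=False,
                                       processing_time=0.0, error=str(exc))
                time.sleep(self.base_delay * (2 ** attempt))  # exponential backoff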

12
ui/chunker/index.html Normal file
View File

@@ -0,0 +1,12 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>MPR Chunker Pipeline</title>
</head>
<body>
<div id="app"></div>
<script type="module" src="/src/main.tsx"></script>
</body>
</html>

1729
ui/chunker/package-lock.json generated Normal file

File diff suppressed because it is too large

22
ui/chunker/package.json Normal file
View File

@@ -0,0 +1,22 @@
{
"name": "mpr-chunker",
"version": "0.1.0",
"private": true,
"type": "module",
"scripts": {
"dev": "vite",
"build": "tsc && vite build",
"preview": "vite preview"
},
"dependencies": {
"react": "^18.2.0",
"react-dom": "^18.2.0"
},
"devDependencies": {
"@types/react": "^18.2.0",
"@types/react-dom": "^18.2.0",
"@vitejs/plugin-react": "^4.2.0",
"typescript": "^5.3.0",
"vite": "^5.0.0"
}
}

735
ui/chunker/src/App.css Normal file
View File

@@ -0,0 +1,735 @@
* {
box-sizing: border-box;
margin: 0;
padding: 0;
}
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto,
"Fira Code", monospace, sans-serif;
background: #0f0f0f;
color: #e0e0e0;
font-size: 14px;
}
/* ---- Layout ---- */
.app {
display: flex;
flex-direction: column;
height: 100vh;
}
.header {
display: flex;
justify-content: space-between;
align-items: center;
padding: 0.75rem 1.25rem;
background: #1a1a1a;
border-bottom: 1px solid #2a2a2a;
}
.header h1 {
font-size: 1.1rem;
font-weight: 600;
letter-spacing: -0.01em;
}
.connection-status {
display: flex;
align-items: center;
gap: 0.5rem;
font-size: 0.8rem;
color: #666;
}
.dot {
width: 8px;
height: 8px;
border-radius: 50%;
background: #555;
}
.dot.connected {
background: #10b981;
box-shadow: 0 0 6px #10b981;
}
.error-banner {
padding: 0.5rem 1.25rem;
background: #7f1d1d;
color: #fca5a5;
font-size: 0.85rem;
}
.layout {
display: flex;
flex: 1;
overflow: hidden;
}
.sidebar {
width: 300px;
background: #141414;
border-right: 1px solid #2a2a2a;
overflow-y: auto;
}
.main {
flex: 1;
overflow-y: auto;
padding: 1rem;
display: flex;
flex-direction: column;
gap: 1rem;
}
.main-grid {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 1rem;
}
.main-left,
.main-right {
display: flex;
flex-direction: column;
gap: 1rem;
}
/* ---- Panel shared ---- */
.panel-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 0.75rem;
}
.panel-header h2 {
font-size: 0.85rem;
text-transform: uppercase;
letter-spacing: 0.05em;
color: #888;
}
.badge-row {
display: flex;
gap: 0.25rem;
}
/* ---- Topic Badge ---- */
.topic-badge {
display: inline-flex;
align-items: center;
gap: 0.25rem;
padding: 0.15rem 0.5rem;
font-size: 0.65rem;
background: #1e293b;
border: 1px solid #334155;
border-radius: 12px;
cursor: pointer;
transition: all 0.2s;
flex-shrink: 0;
}
.topic-badge:hover {
border-color: #3b82f6;
}
.topic-badge.expanded {
flex-direction: column;
align-items: flex-start;
border-radius: 8px;
padding: 0.5rem;
position: relative;
z-index: 10;
background: #1e293b;
}
.topic-number {
color: #3b82f6;
font-weight: 700;
}
.topic-title {
color: #94a3b8;
}
.topic-detail {
margin-top: 0.25rem;
font-size: 0.7rem;
line-height: 1.4;
}
.topic-detail p {
color: #cbd5e1;
margin-bottom: 0.25rem;
}
.topic-detail code {
color: #10b981;
font-size: 0.65rem;
}
/* ---- Asset List ---- */
.scan-button {
padding: 0.25rem 0.5rem;
font-size: 0.7rem;
background: #1e293b;
color: #94a3b8;
border: 1px solid #334155;
border-radius: 4px;
cursor: pointer;
transition: all 0.2s;
}
.scan-button:hover:not(:disabled) {
background: #334155;
color: #e0e0e0;
}
.scan-button:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.asset-list {
list-style: none;
max-height: 200px;
overflow-y: auto;
margin-bottom: 0.75rem;
}
.asset-item {
padding: 0.4rem 0.5rem;
cursor: pointer;
border-left: 2px solid transparent;
transition: all 0.15s;
display: flex;
flex-direction: column;
gap: 0.1rem;
}
.asset-item:hover {
background: #1a1a1a;
}
.asset-item.selected {
background: #1e293b;
border-left-color: #3b82f6;
}
.asset-filename {
font-size: 0.8rem;
color: #e0e0e0;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}
.asset-meta {
font-size: 0.65rem;
color: #555;
}
.asset-empty {
font-size: 0.8rem;
color: #444;
padding: 0.75rem 0.5rem;
text-align: center;
}
.selected-asset-info {
padding: 0.5rem;
background: #1e293b;
border: 1px solid #334155;
border-radius: 4px;
margin-bottom: 0.75rem;
}
.asset-detail {
display: block;
font-size: 0.8rem;
color: #e0e0e0;
font-weight: 500;
}
.asset-detail-meta {
display: block;
font-size: 0.65rem;
color: #64748b;
margin-top: 0.15rem;
}
/* ---- Config Panel ---- */
.config-panel {
padding: 1rem;
}
.config-field {
margin-bottom: 0.75rem;
}
.config-field label {
display: block;
font-size: 0.75rem;
color: #888;
margin-bottom: 0.25rem;
}
.config-field .default {
color: #555;
font-style: italic;
}
.config-field input,
.config-field select {
width: 100%;
padding: 0.4rem 0.5rem;
font-size: 0.8rem;
background: #222;
color: #e0e0e0;
border: 1px solid #333;
border-radius: 4px;
}
.config-field input:focus,
.config-field select:focus {
outline: none;
border-color: #3b82f6;
}
.start-button {
width: 100%;
padding: 0.5rem;
font-size: 0.85rem;
background: #10b981;
color: #000;
border: none;
border-radius: 4px;
cursor: pointer;
font-weight: 600;
margin-top: 0.5rem;
transition: background 0.2s;
}
.start-button:hover:not(:disabled) {
background: #059669;
}
.start-button:disabled {
background: #333;
color: #666;
cursor: not-allowed;
}
/* ---- Pipeline Diagram ---- */
.pipeline-diagram {
background: #141414;
border: 1px solid #2a2a2a;
border-radius: 8px;
padding: 1rem;
}
.stage-flow {
display: flex;
align-items: center;
gap: 0;
overflow-x: auto;
}
.stage-wrapper {
display: flex;
align-items: center;
}
.stage {
padding: 0.5rem 0.75rem;
background: #1a1a1a;
border: 1px solid #333;
border-radius: 6px;
text-align: center;
min-width: 120px;
transition: all 0.3s;
}
.stage.active {
border-color: #3b82f6;
background: #1e293b;
box-shadow: 0 0 12px rgba(59, 130, 246, 0.2);
}
.stage-label {
font-size: 0.8rem;
font-weight: 600;
color: #e0e0e0;
}
.stage-sub {
font-size: 0.65rem;
color: #666;
margin-top: 0.15rem;
}
.stage-arrow {
width: 24px;
height: 2px;
background: #444;
position: relative;
}
.stage-arrow::after {
content: "";
position: absolute;
right: 0;
top: -3px;
border: 4px solid transparent;
border-left: 6px solid #444;
}
.processor-hierarchy {
margin-top: 0.75rem;
padding-top: 0.75rem;
border-top: 1px solid #222;
}
.hierarchy-title {
font-size: 0.7rem;
color: #666;
margin-bottom: 0.35rem;
font-style: italic;
}
.hierarchy-children {
display: flex;
gap: 0.5rem;
flex-wrap: wrap;
}
.hierarchy-node {
font-size: 0.7rem;
padding: 0.15rem 0.5rem;
background: #1a1a1a;
border: 1px solid #333;
border-radius: 4px;
color: #94a3b8;
}
/* ---- Chunk Grid ---- */
.chunk-grid-panel {
background: #141414;
border: 1px solid #2a2a2a;
border-radius: 8px;
padding: 1rem;
}
.chunk-count {
font-size: 0.7rem;
color: #555;
font-weight: 400;
}
.chunk-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(32px, 1fr));
gap: 3px;
max-height: 200px;
overflow-y: auto;
}
.chunk-cell {
aspect-ratio: 1;
display: flex;
align-items: center;
justify-content: center;
font-size: 0.55rem;
color: rgba(255, 255, 255, 0.6);
border-radius: 3px;
transition: background 0.3s;
}
.chunk-legend {
display: flex;
gap: 0.75rem;
margin-top: 0.5rem;
flex-wrap: wrap;
}
.legend-item {
display: flex;
align-items: center;
gap: 0.25rem;
font-size: 0.65rem;
color: #888;
}
.legend-dot {
width: 8px;
height: 8px;
border-radius: 2px;
}
/* ---- Worker Panel ---- */
.worker-panel {
background: #141414;
border: 1px solid #2a2a2a;
border-radius: 8px;
padding: 1rem;
}
.worker-cards {
display: flex;
flex-direction: column;
gap: 0.5rem;
}
.worker-card {
padding: 0.5rem 0.75rem;
background: #1a1a1a;
border: 1px solid #2a2a2a;
border-radius: 6px;
}
.worker-header {
display: flex;
justify-content: space-between;
align-items: center;
}
.worker-name {
font-size: 0.8rem;
font-weight: 500;
}
.worker-state {
font-size: 0.7rem;
text-transform: uppercase;
font-weight: 600;
}
.worker-chunk {
font-size: 0.7rem;
color: #555;
margin-top: 0.15rem;
}
.worker-stats {
display: flex;
gap: 0.75rem;
font-size: 0.65rem;
color: #555;
margin-top: 0.25rem;
}
.worker-empty {
font-size: 0.8rem;
color: #444;
text-align: center;
padding: 1rem;
}
/* ---- Queue Gauge ---- */
.queue-gauge {
background: #141414;
border: 1px solid #2a2a2a;
border-radius: 8px;
padding: 1rem;
}
.gauge-row {
margin-bottom: 0.5rem;
}
.gauge-label {
font-size: 0.75rem;
color: #888;
margin-bottom: 0.25rem;
}
.gauge-value {
color: #e0e0e0;
font-weight: 600;
}
.gauge-bar {
height: 8px;
background: #222;
border-radius: 4px;
overflow: hidden;
}
.gauge-fill {
height: 100%;
border-radius: 4px;
transition: width 0.3s, background 0.3s;
}
.gauge-note {
font-size: 0.65rem;
color: #555;
}
/* ---- Stats Panel ---- */
.stats-panel {
background: #141414;
border: 1px solid #2a2a2a;
border-radius: 8px;
padding: 1rem;
}
.stats-grid {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 0.5rem;
}
.stat {
text-align: center;
padding: 0.5rem;
background: #1a1a1a;
border-radius: 6px;
}
.stat-value {
font-size: 1.1rem;
font-weight: 700;
color: #e0e0e0;
}
.stat-label {
font-size: 0.6rem;
color: #666;
text-transform: uppercase;
letter-spacing: 0.05em;
margin-top: 0.15rem;
}
.test-info {
margin-top: 0.75rem;
padding-top: 0.5rem;
border-top: 1px solid #222;
display: flex;
align-items: center;
gap: 0.5rem;
}
.test-badge {
font-size: 0.65rem;
padding: 0.15rem 0.4rem;
background: #10b981;
color: #000;
border-radius: 3px;
font-weight: 600;
}
.test-note {
font-size: 0.65rem;
color: #555;
}
/* ---- Error Log ---- */
.error-log {
background: #141414;
border: 1px solid #2a2a2a;
border-radius: 8px;
padding: 1rem;
}
.error-count {
font-size: 0.7rem;
background: #7f1d1d;
color: #fca5a5;
padding: 0.1rem 0.4rem;
border-radius: 8px;
font-weight: 400;
}
.exception-tree {
margin-bottom: 0.75rem;
padding: 0.5rem;
background: #1a1a1a;
border-radius: 6px;
font-size: 0.7rem;
font-family: "Fira Code", monospace;
}
.tree-node {
color: #94a3b8;
padding: 0.1rem 0;
}
.tree-node.root {
color: #f59e0b;
font-weight: 600;
}
.tree-node.leaf {
color: #64748b;
}
.tree-children {
padding-left: 1rem;
border-left: 1px solid #333;
margin-left: 0.5rem;
}
.tree-grandchildren {
padding-left: 1rem;
border-left: 1px solid #333;
margin-left: 0.5rem;
}
.error-entries {
max-height: 150px;
overflow-y: auto;
}
.error-empty {
font-size: 0.8rem;
color: #444;
text-align: center;
padding: 0.5rem;
}
.error-entry {
display: flex;
gap: 0.5rem;
align-items: center;
padding: 0.35rem 0;
border-bottom: 1px solid #1a1a1a;
font-size: 0.7rem;
flex-wrap: wrap;
}
.error-type {
color: #ef4444;
font-weight: 500;
}
.error-seq {
color: #f59e0b;
}
.error-worker {
color: #3b82f6;
}
.error-msg {
color: #888;
flex: 1;
}
.error-retries {
color: #f97316;
font-size: 0.65rem;
}

245
ui/chunker/src/App.tsx Normal file
View File

@@ -0,0 +1,245 @@
import { useCallback, useEffect, useMemo, useState } from "react";
import "./App.css";
import { createChunkJob, getAssets, scanMediaFolder } from "./api";
import { ChunkGrid } from "./components/ChunkGrid";
import { ConfigPanel } from "./components/ConfigPanel";
import { ErrorLog } from "./components/ErrorLog";
import { PipelineDiagram } from "./components/PipelineDiagram";
import { QueueGauge } from "./components/QueueGauge";
import { StatsPanel } from "./components/StatsPanel";
import { WorkerPanel } from "./components/WorkerPanel";
import { useEventStream } from "./hooks/useEventStream";
import type {
ChunkInfo,
ErrorEntry,
MediaAsset,
PipelineConfig,
PipelineStats,
WorkerInfo,
} from "./types";
export default function App() {
const [jobId, setJobId] = useState<string | null>(null);
const [running, setRunning] = useState(false);
const [error, setError] = useState<string | null>(null);
// Asset state
const [assets, setAssets] = useState<MediaAsset[]>([]);
const [selectedAsset, setSelectedAsset] = useState<MediaAsset | null>(null);
const [scanning, setScanning] = useState(false);
const { events, connected, done } = useEventStream(jobId);
// Load assets on mount
useEffect(() => {
getAssets()
.then((data) => setAssets(data.sort((a, b) => a.filename.localeCompare(b.filename))))
.catch((e) => setError(e instanceof Error ? e.message : "Failed to load assets"));
}, []);
const handleScan = useCallback(async () => {
setScanning(true);
setError(null);
try {
await scanMediaFolder();
const data = await getAssets();
setAssets(data.sort((a, b) => a.filename.localeCompare(b.filename)));
} catch (e) {
setError(e instanceof Error ? e.message : "Scan failed");
} finally {
setScanning(false);
}
}, []);
// Derive state from events
const { chunks, workers, stats, errors, activeStage, queueSize } =
useMemo(() => {
const chunkMap = new Map<number, ChunkInfo>();
const workerMap = new Map<string, WorkerInfo>();
const errorList: ErrorEntry[] = [];
let totalChunks = 0;
let processed = 0;
let failed = 0;
let retries = 0;
let elapsed = 0;
let throughput = 0;
let queueSize = 0;
let stage = "pending";
for (const evt of events) {
if (evt.total_chunks) totalChunks = evt.total_chunks;
if (evt.processed_chunks) processed = evt.processed_chunks;
if (evt.failed_chunks) failed = evt.failed_chunks;
if (evt.elapsed) elapsed = evt.elapsed;
if (evt.throughput_mbps) throughput = evt.throughput_mbps;
if (evt.queue_size !== undefined) queueSize = evt.queue_size;
if (evt.status && evt.status !== "waiting") stage = evt.status;
// Track chunks
if (evt.sequence !== undefined) {
const existing = chunkMap.get(evt.sequence) || {
sequence: evt.sequence,
state: "pending" as const,
};
if (evt.status === "chunking" || evt.status === "pending") {
existing.state = "queued";
} else if (evt.status === "processing") {
existing.state = "processing";
if (evt.worker_id) existing.worker_id = evt.worker_id;
} else if (evt.status === "completed") {
existing.state = "done";
if (evt.processing_time)
existing.processing_time = evt.processing_time;
if (evt.retries) existing.retries = evt.retries;
} else if (evt.status === "failed") {
existing.state = "error";
if (evt.error) existing.error = evt.error;
}
if (evt.size) existing.size = evt.size;
chunkMap.set(evt.sequence, existing);
}
// Track workers
if (evt.worker_id) {
const w = workerMap.get(evt.worker_id) || {
worker_id: evt.worker_id,
state: "idle" as const,
processed: 0,
errors: 0,
retries: 0,
};
if (evt.state === "processing") {
w.state = "processing";
w.current_chunk = evt.sequence;
} else if (evt.state === "idle") {
w.state = "idle";
w.current_chunk = undefined;
} else if (evt.state === "stopped") {
w.state = "stopped";
}
if (evt.success !== undefined) {
if (evt.success) w.processed++;
else w.errors++;
}
if (evt.retries) {
retries += evt.retries;
w.retries += evt.retries;
}
workerMap.set(evt.worker_id, w);
}
// Track errors
if (evt.error) {
errorList.push({
timestamp: Date.now(),
sequence: evt.sequence,
worker_id: evt.worker_id,
error: evt.error,
retries: evt.retries,
event_type: evt.status || "error",
});
}
}
const statsObj: PipelineStats = {
total_chunks: totalChunks,
processed,
failed,
retries,
elapsed,
throughput_mbps: throughput,
queue_size: queueSize,
};
return {
chunks: Array.from(chunkMap.values()).sort(
(a, b) => a.sequence - b.sequence
),
workers: Array.from(workerMap.values()),
stats: statsObj,
errors: errorList,
activeStage: stage,
queueSize,
};
}, [events]);
const handleStart = useCallback(async (config: PipelineConfig) => {
setError(null);
setRunning(true);
try {
const result = await createChunkJob(config);
setJobId(result.id);
} catch (e) {
setError(e instanceof Error ? e.message : "Failed to start");
setRunning(false);
}
}, []);
// Reset running state when done
if (done && running) {
setRunning(false);
}
return (
<div className="app">
<header className="header">
<h1>MPR Chunker Pipeline</h1>
<div className="connection-status">
{jobId && (
<span className={`dot ${connected ? "connected" : ""}`} />
)}
<span className="status-text">
{!jobId
? "Configure and launch"
: connected
? "Streaming"
: done
? "Complete"
: "Connecting..."}
</span>
</div>
</header>
{error && <div className="error-banner">{error}</div>}
<div className="layout">
<aside className="sidebar">
<ConfigPanel
onStart={handleStart}
running={running}
assets={assets}
selectedAsset={selectedAsset}
onSelectAsset={setSelectedAsset}
onScan={handleScan}
scanning={scanning}
/>
</aside>
<main className="main">
<PipelineDiagram activeStage={activeStage} />
<div className="main-grid">
<div className="main-left">
<ChunkGrid chunks={chunks} totalChunks={stats.total_chunks} />
<QueueGauge
current={queueSize}
max={10}
buffered={0}
/>
</div>
<div className="main-right">
<WorkerPanel workers={workers} />
<StatsPanel stats={stats} />
<ErrorLog errors={errors} />
</div>
</div>
</main>
</div>
</div>
);
}

72
ui/chunker/src/api.ts Normal file
View File

@@ -0,0 +1,72 @@
/**
* GraphQL API client for the chunker UI.
*/
import type { MediaAsset } from "./types";
const GRAPHQL_URL = "/api/graphql";
async function gql<T>(query: string, variables?: Record<string, unknown>): Promise<T> {
const response = await fetch(GRAPHQL_URL, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ query, variables }),
});
const json = await response.json();
if (json.errors?.length) {
throw new Error(json.errors[0].message);
}
return json.data as T;
}
/** Fetch all media assets. */
export async function getAssets(): Promise<MediaAsset[]> {
const data = await gql<{ assets: MediaAsset[] }>(`
query {
assets {
id filename file_path status error_message file_size duration
video_codec audio_codec width height framerate bitrate
properties comments tags created_at updated_at
}
}
`);
return data.assets;
}
/** Scan media/in/ folder for new files. */
export async function scanMediaFolder(): Promise<{
found: number;
registered: number;
skipped: number;
files: string[];
}> {
const data = await gql<{ scan_media_folder: { found: number; registered: number; skipped: number; files: string[] } }>(`
mutation {
scan_media_folder { found registered skipped files }
}
`);
return data.scan_media_folder;
}
/** Create a chunk job via GraphQL mutation. */
export async function createChunkJob(config: {
source_asset_id: string;
chunk_duration: number;
num_workers: number;
max_retries: number;
processor_type: string;
}): Promise<{ id: string }> {
const data = await gql<{ create_chunk_job: { id: string; status: string } }>(`
mutation CreateChunkJob($input: CreateChunkJobInput!) {
create_chunk_job(input: $input) {
id
status
}
}
`, { input: config });
return data.create_chunk_job;
}

ui/chunker/src/components/ChunkGrid.tsx Normal file
View File

@@ -0,0 +1,59 @@
import type { ChunkInfo } from "../types";
import { TopicBadge, TOPICS } from "./TopicBadge";
interface Props {
chunks: ChunkInfo[];
totalChunks: number;
}
const STATE_COLORS: Record<string, string> = {
pending: "#333",
queued: "#f59e0b",
processing: "#3b82f6",
done: "#10b981",
error: "#ef4444",
retry: "#f97316",
};
/**
* Grid of chunks colored by processing state.
* Chunks appear incrementally as the generator yields them.
* Interview Topic 3: Generators & iteration.
*/
export function ChunkGrid({ chunks, totalChunks }: Props) {
return (
<div className="chunk-grid-panel">
<div className="panel-header">
<h2>
Chunks{" "}
<span className="chunk-count">
{chunks.length} / {totalChunks || "?"}
</span>
</h2>
<TopicBadge topic={TOPICS.iteration} />
</div>
<div className="chunk-grid">
{chunks.map((chunk) => (
<div
key={chunk.sequence}
className="chunk-cell"
style={{ background: STATE_COLORS[chunk.state] || "#333" }}
title={`#${chunk.sequence} · ${chunk.state}${
chunk.worker_id ? ` (${chunk.worker_id})` : ""
}${chunk.retries ? ` retries: ${chunk.retries}` : ""}`}
>
{chunk.sequence}
</div>
))}
</div>
<div className="chunk-legend">
{Object.entries(STATE_COLORS).map(([state, color]) => (
<span key={state} className="legend-item">
<span className="legend-dot" style={{ background: color }} />
{state}
</span>
))}
</div>
</div>
);
}

ui/chunker/src/components/ConfigPanel.tsx Normal file
View File

@@ -0,0 +1,172 @@
import { useState } from "react";
import type { MediaAsset, PipelineConfig } from "../types";
import { TopicBadge, TOPICS } from "./TopicBadge";
interface Props {
onStart: (config: PipelineConfig) => void;
running: boolean;
assets: MediaAsset[];
selectedAsset: MediaAsset | null;
onSelectAsset: (asset: MediaAsset) => void;
onScan: () => void;
scanning: boolean;
}
/**
* Pipeline configuration form with file browser.
* Each parameter shows its default — Interview Topic 1: Function params & defaults.
*/
export function ConfigPanel({
onStart,
running,
assets,
selectedAsset,
onSelectAsset,
onScan,
scanning,
}: Props) {
const [chunkDuration, setChunkDuration] = useState(10.0);
const [numWorkers, setNumWorkers] = useState(4);
const [maxRetries, setMaxRetries] = useState(3);
const [processorType, setProcessorType] = useState<
"ffmpeg" | "checksum" | "simulated_decode" | "composite"
>("ffmpeg");
const handleSubmit = (e: React.FormEvent) => {
e.preventDefault();
if (!selectedAsset) return;
onStart({
source_asset_id: selectedAsset.id,
chunk_duration: chunkDuration,
num_workers: numWorkers,
max_retries: maxRetries,
processor_type: processorType,
});
};
const formatSize = (bytes: number | null) => {
if (!bytes) return "—";
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`;
return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
};
const formatDuration = (seconds: number | null) => {
if (!seconds) return "—";
const m = Math.floor(seconds / 60);
const s = Math.floor(seconds % 60);
return `${m}:${s.toString().padStart(2, "0")}`;
};
return (
<div className="config-panel">
{/* Asset Browser */}
<div className="panel-header">
<h2>Assets</h2>
<button
onClick={onScan}
disabled={scanning}
className="scan-button"
>
{scanning ? "Scanning..." : "Scan Folder"}
</button>
</div>
<ul className="asset-list">
{assets.length === 0 ? (
<li className="asset-empty">No assets click Scan Folder</li>
) : (
assets.map((asset) => (
<li
key={asset.id}
className={`asset-item ${selectedAsset?.id === asset.id ? "selected" : ""}`}
onClick={() => onSelectAsset(asset)}
title={asset.filename}
>
<span className="asset-filename">{asset.filename}</span>
<span className="asset-meta">
{formatSize(asset.file_size)} · {formatDuration(asset.duration)}
</span>
</li>
))
)}
</ul>
{selectedAsset && (
<div className="selected-asset-info">
<span className="asset-detail">{selectedAsset.filename}</span>
<span className="asset-detail-meta">
{selectedAsset.video_codec} · {selectedAsset.width}x{selectedAsset.height} · {formatDuration(selectedAsset.duration)}
</span>
</div>
)}
{/* Pipeline Config */}
<div className="panel-header" style={{ marginTop: "1rem" }}>
<h2>Pipeline Config</h2>
<TopicBadge topic={TOPICS.params} />
</div>
<form onSubmit={handleSubmit}>
<div className="config-field">
<label>
Chunk Duration <span className="default">default: 10s</span>
</label>
<select
value={chunkDuration}
onChange={(e) => setChunkDuration(Number(e.target.value))}
>
<option value={5}>5 seconds</option>
<option value={10}>10 seconds</option>
<option value={15}>15 seconds</option>
<option value={30}>30 seconds</option>
<option value={60}>60 seconds</option>
</select>
</div>
<div className="config-field">
<label>
Workers <span className="default">default: 4</span>
</label>
<input
type="number"
min={1}
max={16}
value={numWorkers}
onChange={(e) => setNumWorkers(Number(e.target.value))}
/>
</div>
<div className="config-field">
<label>
Max Retries <span className="default">default: 3</span>
</label>
<input
type="number"
min={0}
max={10}
value={maxRetries}
onChange={(e) => setMaxRetries(Number(e.target.value))}
/>
</div>
<div className="config-field">
<label>
Processor <span className="default">default: ffmpeg</span>
</label>
<select
value={processorType}
onChange={(e) =>
setProcessorType(
e.target.value as "ffmpeg" | "checksum" | "simulated_decode" | "composite"
)
}
>
<option value="ffmpeg">FFmpegExtractProcessor</option>
<option value="checksum">ChecksumProcessor</option>
<option value="simulated_decode">SimulatedDecodeProcessor</option>
<option value="composite">CompositeProcessor</option>
</select>
</div>
<button type="submit" className="start-button" disabled={running || !selectedAsset}>
{running ? "Running..." : "Launch Pipeline"}
</button>
</form>
</div>
);
}


@@ -0,0 +1,63 @@
import type { ErrorEntry } from "../types";
import { TopicBadge, TOPICS } from "./TopicBadge";
interface Props {
errors: ErrorEntry[];
}
/**
* Error and retry event log.
* Shows exception types, retry counts, backoff delays.
* Interview Topic 7: Exception handling & resilient code.
*/
export function ErrorLog({ errors }: Props) {
return (
<div className="error-log">
<div className="panel-header">
<h2>
Errors & Retries{" "}
<span className="error-count">{errors.length}</span>
</h2>
<TopicBadge topic={TOPICS.exceptions} />
</div>
<div className="exception-tree">
<div className="tree-node root">PipelineError</div>
<div className="tree-children">
<div className="tree-node">ChunkError</div>
<div className="tree-grandchildren">
<div className="tree-node leaf">ChunkReadError</div>
<div className="tree-node leaf">ChunkChecksumError</div>
</div>
<div className="tree-node">ProcessingError</div>
<div className="tree-grandchildren">
<div className="tree-node leaf">ProcessorTimeoutError</div>
<div className="tree-node leaf">ProcessorFailureError</div>
</div>
<div className="tree-node">ReassemblyError</div>
</div>
</div>
<div className="error-entries">
{errors.length === 0 && (
<div className="error-empty">No errors recorded</div>
)}
{errors.map((entry, i) => (
<div key={i} className="error-entry">
<span className="error-type">{entry.event_type}</span>
{entry.sequence !== undefined && (
<span className="error-seq">chunk #{entry.sequence}</span>
)}
{entry.worker_id && (
<span className="error-worker">{entry.worker_id}</span>
)}
<span className="error-msg">{entry.error}</span>
{entry.retries !== undefined && entry.retries > 0 && (
<span className="error-retries">
{entry.retries} retries
</span>
)}
</div>
))}
</div>
</div>
);
}
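
One plausible mapping from a failure-carrying PipelineEvent to the ErrorEntry shape rendered above; the field choices are assumptions, since the commit hunk does not show the producer of these entries:

// Assumed converter: build an ErrorEntry from an error/retry SSE event.
import type { ErrorEntry, PipelineEvent } from "../types";

export function toErrorEntry(
  ev: PipelineEvent,
  eventType: string,
): ErrorEntry | null {
  if (!ev.error) return null; // only events carrying an error message qualify
  return {
    timestamp: Date.now(),
    sequence: ev.sequence,
    worker_id: ev.worker_id,
    error: ev.error,
    retries: ev.retries,
    event_type: eventType,
  };
}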


@@ -0,0 +1,50 @@
import { TopicBadge, TOPICS } from "./TopicBadge";
interface Props {
activeStage: string;
}
const STAGES = [
{ id: "chunking", label: "Chunker", sub: "File -> Chunks (generator)" },
{ id: "queued", label: "ChunkQueue", sub: "Bounded queue (backpressure)" },
{ id: "processing", label: "WorkerPool", sub: "ThreadPoolExecutor" },
{ id: "collecting", label: "ResultCollector", sub: "heapq reassembly" },
{ id: "completed", label: "PipelineResult", sub: "Aggregate stats" },
];
/**
* Visual flow diagram of pipeline stages.
* Highlights the currently active stage.
* Interview Topic 4: OOP design — shows class hierarchy.
*/
export function PipelineDiagram({ activeStage }: Props) {
return (
<div className="pipeline-diagram">
<div className="panel-header">
<h2>Pipeline Flow</h2>
<TopicBadge topic={TOPICS.oop} />
</div>
<div className="stage-flow">
{STAGES.map((stage, i) => (
<div key={stage.id} className="stage-wrapper">
<div
className={`stage ${activeStage === stage.id ? "active" : ""}`}
>
<div className="stage-label">{stage.label}</div>
<div className="stage-sub">{stage.sub}</div>
</div>
{i < STAGES.length - 1 && <div className="stage-arrow" />}
</div>
))}
</div>
<div className="processor-hierarchy">
<div className="hierarchy-title">Processor ABC</div>
<div className="hierarchy-children">
<span className="hierarchy-node">ChecksumProcessor</span>
<span className="hierarchy-node">SimulatedDecodeProcessor</span>
<span className="hierarchy-node">CompositeProcessor</span>
</div>
</div>
</div>
);
}


@@ -0,0 +1,46 @@
import { TopicBadge, TOPICS } from "./TopicBadge";
interface Props {
current: number;
max: number;
buffered: number;
}
/**
* Queue fill level gauge + collector heap buffer.
* Interview Topic 5: Data structures — queue.Queue, heapq, deque.
*/
export function QueueGauge({ current, max, buffered }: Props) {
const fillPct = max > 0 ? Math.min((current / max) * 100, 100) : 0;
return (
<div className="queue-gauge">
<div className="panel-header">
<h2>Queue & Buffer</h2>
<TopicBadge topic={TOPICS.datastructures} />
</div>
<div className="gauge-row">
<div className="gauge-label">
Queue <span className="gauge-value">{current}/{max}</span>
</div>
<div className="gauge-bar">
<div
className="gauge-fill"
style={{
width: `${fillPct}%`,
background: fillPct > 80 ? "#ef4444" : "#3b82f6",
}}
/>
</div>
</div>
<div className="gauge-row">
<div className="gauge-label">
Heap Buffer <span className="gauge-value">{buffered}</span>
</div>
<div className="gauge-note">
Out-of-order results waiting for gaps to fill
</div>
</div>
</div>
);
}


@@ -0,0 +1,59 @@
import type { PipelineStats } from "../types";
import { TopicBadge, TOPICS } from "./TopicBadge";
interface Props {
stats: PipelineStats;
}
/**
* Throughput, timing, and error stats.
* Interview Topic 6: Algorithms — throughput calculation over sliding window.
* Interview Topic 8: TDD — test count and coverage.
*/
export function StatsPanel({ stats }: Props) {
return (
<div className="stats-panel">
<div className="panel-header">
<h2>Stats</h2>
<div className="badge-row">
<TopicBadge topic={TOPICS.algorithms} />
<TopicBadge topic={TOPICS.testing} />
</div>
</div>
<div className="stats-grid">
<div className="stat">
<div className="stat-value">{stats.total_chunks}</div>
<div className="stat-label">Total Chunks</div>
</div>
<div className="stat">
<div className="stat-value">{stats.processed}</div>
<div className="stat-label">Processed</div>
</div>
<div className="stat">
<div className="stat-value">{stats.failed}</div>
<div className="stat-label">Failed</div>
</div>
<div className="stat">
<div className="stat-value">{stats.retries}</div>
<div className="stat-label">Retries</div>
</div>
<div className="stat">
<div className="stat-value">
{stats.throughput_mbps.toFixed(2)}
</div>
<div className="stat-label">MB/s</div>
</div>
<div className="stat">
<div className="stat-value">{stats.elapsed.toFixed(2)}s</div>
<div className="stat-label">Elapsed</div>
</div>
</div>
<div className="test-info">
<span className="test-badge">64 tests</span>
<span className="test-note">
7 test files &middot; pytest &middot; parametrized
</span>
</div>
</div>
);
}
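
For Topic 6, a hedged TypeScript rendition of sliding-window throughput; the backend computes this in Python with a deque, so the sample shape and window length here are illustrative:

// Sketch only: samples are assumed ordered by timestamp (seconds).
type Sample = { t: number; bytes: number };

export function slidingThroughputMbps(
  samples: Sample[],
  windowSeconds = 5, // assumed window; the real value lives in the backend
): number {
  if (samples.length === 0) return 0;
  const now = samples[samples.length - 1].t;
  const recent = samples.filter((s) => now - s.t <= windowSeconds);
  if (recent.length < 2) return 0;
  const bytes = recent.reduce((sum, s) => sum + s.bytes, 0);
  const span = recent[recent.length - 1].t - recent[0].t || windowSeconds;
  return bytes / (1024 * 1024) / span; // MB per second
}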


@@ -0,0 +1,86 @@
import { useState } from "react";
import type { InterviewTopic } from "../types";
/**
* Expandable pill badge annotating an interview topic.
* Click to expand and see description + code reference.
*/
export function TopicBadge({ topic }: { topic: InterviewTopic }) {
const [expanded, setExpanded] = useState(false);
return (
<div
className={`topic-badge ${expanded ? "expanded" : ""}`}
onClick={() => setExpanded(!expanded)}
>
<span className="topic-number">#{topic.number}</span>
<span className="topic-title">{topic.title}</span>
{expanded && (
<div className="topic-detail">
<p>{topic.description}</p>
<code>{topic.code_ref}</code>
</div>
)}
</div>
);
}
/** Pre-defined topics mapped to pipeline components. */
export const TOPICS: Record<string, InterviewTopic> = {
params: {
number: 1,
title: "Function Params & Defaults",
description:
"Each pipeline parameter has a sensible default (chunk_duration=10s, num_workers=4, max_retries=3). Tweaking them changes pipeline behavior.",
code_ref: "core/chunker/pipeline.py — Pipeline.__init__()",
},
concurrency: {
number: 2,
title: "Concurrency (Threading)",
description:
"Workers run in a ThreadPoolExecutor. The queue coordinates work between producer and consumer threads.",
code_ref: "core/chunker/pool.py — WorkerPool, ThreadPoolExecutor",
},
iteration: {
number: 3,
title: "Generators & Iteration",
description:
"Chunks are yielded lazily via a generator — the file is never fully loaded into memory.",
code_ref: "core/chunker/chunker.py — Chunker.chunks() generator",
},
oop: {
number: 4,
title: "OOP Design (ABC)",
description:
"Processor is an abstract base class. ChecksumProcessor, SimulatedDecodeProcessor, and CompositeProcessor inherit from it.",
code_ref: "core/chunker/processor.py — Processor ABC hierarchy",
},
datastructures: {
number: 5,
title: "Data Structures",
description:
"Bounded queue.Queue for backpressure, heapq min-heap for ordered reassembly, deque for sliding-window throughput.",
code_ref: "core/chunker/queue.py, collector.py, models.py",
},
algorithms: {
number: 6,
title: "Algorithms & Sorting",
description:
"ResultCollector uses a min-heap to reassemble chunks in sequence order, even when they arrive out of order.",
code_ref: "core/chunker/collector.py — heapq-based reassembly",
},
exceptions: {
number: 7,
title: "Exception Handling",
description:
"PipelineError hierarchy with typed exceptions. Workers retry with exponential backoff before giving up.",
code_ref: "core/chunker/exceptions.py, worker.py — retry logic",
},
testing: {
number: 8,
title: "TDD & Unit Testing",
description:
"64 tests covering every module. Parametrized tests, fixtures, edge cases, concurrency tests.",
code_ref: "tests/chunker/ — 7 test files, pytest",
},
};
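
A sketch of the reassembly idea behind TOPICS.algorithms, transposed to TypeScript: the Python collector uses heapq, so a Map keyed by sequence plus a next-expected cursor is an equivalent illustration, not the actual implementation:

// Assumes sequences start at 0 and are contiguous.
export class ReassemblyBuffer<T extends { sequence: number }> {
  private buffer = new Map<number, T>();
  private next = 0;

  /** Accept a result in any order; return the run now emittable in order. */
  push(item: T): T[] {
    this.buffer.set(item.sequence, item);
    const ready: T[] = [];
    while (this.buffer.has(this.next)) {
      ready.push(this.buffer.get(this.next)!);
      this.buffer.delete(this.next);
      this.next += 1;
    }
    return ready;
  }

  /** Out-of-order results waiting for gaps — the QueueGauge "Heap Buffer" count. */
  get buffered(): number {
    return this.buffer.size;
  }
}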


@@ -0,0 +1,55 @@
import type { WorkerInfo } from "../types";
import { TopicBadge, TOPICS } from "./TopicBadge";
interface Props {
workers: WorkerInfo[];
}
const STATE_COLORS: Record<string, string> = {
idle: "#6b7280",
processing: "#3b82f6",
retry: "#f97316",
stopped: "#ef4444",
};
/**
* Worker thread status cards.
* Shows each worker's real-time state and which chunk it's processing.
* Interview Topic 2: Concurrency (threading).
*/
export function WorkerPanel({ workers }: Props) {
return (
<div className="worker-panel">
<div className="panel-header">
<h2>Workers</h2>
<TopicBadge topic={TOPICS.concurrency} />
</div>
<div className="worker-cards">
{workers.map((w) => (
<div key={w.worker_id} className="worker-card">
<div className="worker-header">
<span className="worker-name">{w.worker_id}</span>
<span
className="worker-state"
style={{ color: STATE_COLORS[w.state] || "#888" }}
>
{w.state}
</span>
</div>
{w.current_chunk !== undefined && (
<div className="worker-chunk">chunk #{w.current_chunk}</div>
)}
<div className="worker-stats">
<span>done: {w.processed}</span>
<span>err: {w.errors}</span>
<span>retry: {w.retries}</span>
</div>
</div>
))}
{workers.length === 0 && (
<div className="worker-empty">No workers started</div>
)}
</div>
</div>
);
}


@@ -0,0 +1,81 @@
import { useCallback, useEffect, useRef, useState } from "react";
import type { PipelineEvent } from "../types";
/**
* SSE hook — connects to /api/chunker/stream/{jobId} via native EventSource.
*
* Demonstrates: real-time event streaming from backend to UI.
*/
export function useEventStream(jobId: string | null) {
const [events, setEvents] = useState<PipelineEvent[]>([]);
const [connected, setConnected] = useState(false);
const [done, setDone] = useState(false);
const esRef = useRef<EventSource | null>(null);
const close = useCallback(() => {
if (esRef.current) {
esRef.current.close();
esRef.current = null;
setConnected(false);
}
}, []);
useEffect(() => {
if (!jobId) return;
setEvents([]);
setDone(false);
const es = new EventSource(`/api/chunker/stream/${jobId}`);
esRef.current = es;
es.onopen = () => setConnected(true);
es.onerror = () => setConnected(false);
const handleEvent = (eventType: string) => (e: MessageEvent) => {
try {
const data = JSON.parse(e.data) as PipelineEvent;
setEvents((prev) => [...prev, { ...data, status: eventType }]);
} catch {
// ignore parse errors
}
};
// Listen to all chunker event types
const eventTypes = [
"waiting",
"pending",
"chunking",
"processing",
"collecting",
"completed",
"failed",
"cancelled",
"done",
"timeout",
];
for (const type of eventTypes) {
es.addEventListener(type, handleEvent(type));
}
es.addEventListener("done", () => {
setDone(true);
es.close();
setConnected(false);
});
es.addEventListener("timeout", () => {
setDone(true);
es.close();
setConnected(false);
});
return () => {
es.close();
esRef.current = null;
};
}, [jobId]);
return { events, connected, done, close };
}
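
A hedged usage sketch for the hook; the component name, rendering, and the assumption that progress is reported in [0, 1] are illustrative, not from this commit:

// Illustrative consumer — assumes useEventStream is imported from this module.
import { useMemo } from "react";

export function JobProgress({ jobId }: { jobId: string | null }) {
  const { events, connected, done } = useEventStream(jobId);
  // Most recent event carrying a job-level progress value wins.
  const progress = useMemo(() => {
    for (let i = events.length - 1; i >= 0; i--) {
      const p = events[i].progress;
      if (p !== undefined) return p;
    }
    return 0;
  }, [events]);
  return (
    <div>
      {connected ? "live" : done ? "finished" : "connecting"} ·{" "}
      {(progress * 100).toFixed(0)}%
    </div>
  );
}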

9
ui/chunker/src/main.tsx Normal file

@@ -0,0 +1,9 @@
import React from "react";
import ReactDOM from "react-dom/client";
import App from "./App";
ReactDOM.createRoot(document.getElementById("app")!).render(
<React.StrictMode>
<App />
</React.StrictMode>
);

114
ui/chunker/src/types.ts Normal file

@@ -0,0 +1,114 @@
/** Pipeline configuration sent to the backend. */
export interface PipelineConfig {
source_asset_id: string;
chunk_duration: number;
num_workers: number;
max_retries: number;
processor_type: "ffmpeg" | "checksum" | "simulated_decode" | "composite";
}
/** Media asset from the backend. */
export interface MediaAsset {
id: string;
filename: string;
file_path: string;
status: string;
error_message: string | null;
file_size: number | null;
duration: number | null;
video_codec: string | null;
audio_codec: string | null;
width: number | null;
height: number | null;
framerate: number | null;
bitrate: number | null;
properties: Record<string, unknown>;
comments: string;
tags: string[];
created_at: string | null;
updated_at: string | null;
}
/** State of an individual chunk. */
export type ChunkState =
| "pending"
| "queued"
| "processing"
| "done"
| "error"
| "retry";
/** Tracked chunk in the UI grid. */
export interface ChunkInfo {
sequence: number;
state: ChunkState;
size?: number;
worker_id?: string;
retries?: number;
processing_time?: number;
error?: string;
}
/** Worker thread status. */
export interface WorkerInfo {
worker_id: string;
state: "idle" | "processing" | "retry" | "stopped";
current_chunk?: number;
processed: number;
errors: number;
retries: number;
}
/** SSE event from the backend. */
export interface PipelineEvent {
job_id: string;
status?: string;
progress?: number;
total_chunks?: number;
processed_chunks?: number;
failed_chunks?: number;
throughput_mbps?: number;
elapsed?: number;
error?: string;
// Chunk-level fields
sequence?: number;
size?: number;
worker_id?: string;
success?: boolean;
processing_time?: number;
retries?: number;
queue_size?: number;
// Worker-level fields
state?: string;
attempt?: number;
backoff?: number;
}
/** Aggregate pipeline stats. */
export interface PipelineStats {
total_chunks: number;
processed: number;
failed: number;
retries: number;
elapsed: number;
throughput_mbps: number;
queue_size: number;
}
/** Error log entry. */
export interface ErrorEntry {
timestamp: number;
sequence?: number;
worker_id?: string;
error: string;
retries?: number;
event_type: string;
}
/** Interview topic for annotation badges. */
export interface InterviewTopic {
number: number;
title: string;
description: string;
code_ref: string;
}

1
ui/chunker/src/vite-env.d.ts vendored Normal file

@@ -0,0 +1 @@
/// <reference types="vite/client" />

21
ui/chunker/tsconfig.json Normal file

@@ -0,0 +1,21 @@
{
"compilerOptions": {
"target": "ES2020",
"useDefineForClassFields": true,
"module": "ESNext",
"lib": ["ES2020", "DOM", "DOM.Iterable"],
"skipLibCheck": true,
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"resolveJsonModule": true,
"isolatedModules": true,
"noEmit": true,
"jsx": "react-jsx",
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noFallthroughCasesInSwitch": true
},
"include": ["src/**/*.ts", "src/**/*.tsx"],
"references": [{ "path": "./tsconfig.node.json" }]
}


@@ -0,0 +1,10 @@
{
"compilerOptions": {
"composite": true,
"skipLibCheck": true,
"module": "ESNext",
"moduleResolution": "bundler",
"allowSyntheticDefaultImports": true
},
"include": ["vite.config.ts"]
}

21
ui/chunker/vite.config.ts Normal file

@@ -0,0 +1,21 @@
import { defineConfig } from "vite";
import react from "@vitejs/plugin-react";
export default defineConfig({
plugins: [react()],
server: {
host: "0.0.0.0",
port: 5174,
allowedHosts: process.env.VITE_ALLOWED_HOSTS?.split(",") || [],
proxy: {
"/api": {
target: "http://fastapi:8702",
changeOrigin: true,
},
"/graphql": {
target: "http://fastapi:8702",
changeOrigin: true,
},
},
},
});


@@ -0,0 +1,2 @@
node_modules/
dist/


@@ -6,6 +6,7 @@
export type AssetStatus = "pending" | "ready" | "error";
export type JobStatus = "pending" | "processing" | "completed" | "failed" | "cancelled";
export type ChunkJobStatus = "pending" | "chunking" | "processing" | "collecting" | "completed" | "failed" | "cancelled";
export interface MediaAsset {
id: string;
@@ -73,6 +74,29 @@ export interface TranscodeJob {
completed_at: string | null;
}
export interface ChunkJob {
id: string;
source_asset_id: string;
chunk_duration: number;
num_workers: number;
max_retries: number;
processor_type: string;
status: ChunkJobStatus;
progress: number;
total_chunks: number;
processed_chunks: number;
failed_chunks: number;
retry_count: number;
error_message: string | null;
throughput_mbps: number | null;
elapsed_seconds: number | null;
celery_task_id: string | null;
priority: number;
created_at: string | null;
started_at: string | null;
completed_at: string | null;
}
export interface CreateJobRequest {
source_asset_id: string;
preset_id: string | null;