chunker and ui

2026-03-13 14:29:38 -03:00
parent 3eeedebb15
commit ccc478fbaa
69 changed files with 6481 additions and 282 deletions

64
core/chunker/__init__.py Normal file

@@ -0,0 +1,64 @@
"""
Chunker pipeline — splits files into chunks, processes concurrently, reassembles in order.
Public API:
Pipeline — orchestrates the full pipeline
PipelineResult — aggregate result dataclass
Chunker — file → Chunk generator
ChunkQueue — bounded thread-safe queue
WorkerPool — manages N worker threads
ResultCollector — heapq-based ordered reassembly
"""
from .chunker import Chunker
from .collector import ResultCollector
from .exceptions import (
ChunkChecksumError,
ChunkError,
ChunkReadError,
PipelineError,
ProcessingError,
ProcessorFailureError,
ProcessorTimeoutError,
ReassemblyError,
)
from .models import Chunk, ChunkResult, PipelineResult
from .pipeline import Pipeline
from .pool import WorkerPool
from .processor import (
ChecksumProcessor,
CompositeProcessor,
FFmpegExtractProcessor,
Processor,
SimulatedDecodeProcessor,
)
from .queue import ChunkQueue
__all__ = [
# Core
"Pipeline",
"PipelineResult",
# Components
"Chunker",
"ChunkQueue",
"WorkerPool",
"ResultCollector",
# Models
"Chunk",
"ChunkResult",
# Processors
"Processor",
"ChecksumProcessor",
"SimulatedDecodeProcessor",
"CompositeProcessor",
"FFmpegExtractProcessor",
# Exceptions
"PipelineError",
"ChunkError",
"ChunkReadError",
"ChunkChecksumError",
"ProcessingError",
"ProcessorFailureError",
"ProcessorTimeoutError",
"ReassemblyError",
]
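
For orientation, a minimal end-to-end usage sketch of this public API (the source path sample.mp4 is a placeholder, not part of this commit):

from core.chunker import Pipeline

pipeline = Pipeline(
    source="sample.mp4",        # placeholder path
    chunk_duration=10.0,
    num_workers=4,
    processor_type="checksum",  # metadata-only processor, no output files
)
result = pipeline.run()
print(f"{result.processed}/{result.total_chunks} chunks, "
      f"{result.failed} failed, in-order={result.chunks_in_order}")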

86
core/chunker/chunker.py Normal file

@@ -0,0 +1,86 @@
"""
Chunker — probes a media file and yields time-based Chunk objects.
Demonstrates:
- Function parameters and defaults (Interview Topic 1)
- List comprehensions and efficient iteration / generators (Interview Topic 3)
"""
import math
import os
from typing import Generator
from core.ffmpeg.probe import probe_file
from .exceptions import ChunkReadError
from .models import Chunk
class Chunker:
"""
Splits a media file into time-based chunks via a generator.
Uses FFmpeg probe to get duration, then yields Chunk objects
representing time segments (no data read — extraction happens in the processor).
Args:
file_path: Path to the source media file
chunk_duration: Duration of each chunk in seconds (default: 10.0)
"""
def __init__(self, file_path: str, chunk_duration: float = 10.0):
if not os.path.isfile(file_path):
raise ChunkReadError(f"File not found: {file_path}")
if chunk_duration <= 0:
raise ValueError("chunk_duration must be positive")
self.file_path = file_path
self.chunk_duration = chunk_duration
self.file_size = os.path.getsize(file_path)
self.source_duration = self._probe_duration()
def _probe_duration(self) -> float:
"""Get source file duration via FFmpeg probe."""
try:
result = probe_file(self.file_path)
if result.duration is None or result.duration <= 0:
raise ChunkReadError(
f"Cannot determine duration for {self.file_path}"
)
return result.duration
except ChunkReadError:
raise
except Exception as e:
raise ChunkReadError(
f"Failed to probe {self.file_path}: {e}"
) from e
@property
def expected_chunks(self) -> int:
"""Calculate expected number of chunks (last chunk may be shorter)."""
if self.source_duration <= 0:
return 0
return math.ceil(self.source_duration / self.chunk_duration)
def chunks(self) -> Generator[Chunk, None, None]:
"""
Yield Chunk objects representing time segments of the source file.
Generator-based: chunks are yielded on demand.
Each chunk defines a time range — actual extraction is done by the processor.
"""
total = self.expected_chunks
for sequence in range(total):
start_time = sequence * self.chunk_duration
end_time = min(
start_time + self.chunk_duration, self.source_duration
)
duration = end_time - start_time
yield Chunk(
sequence=sequence,
start_time=start_time,
end_time=end_time,
source_path=self.file_path,
duration=duration,
)
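
A quick sketch of the generator in isolation (sample.mp4 is a placeholder path):

from core.chunker import Chunker

chunker = Chunker("sample.mp4", chunk_duration=10.0)  # placeholder path
print(f"{chunker.source_duration:.1f}s -> {chunker.expected_chunks} chunks")
for chunk in chunker.chunks():
    # e.g. chunk 0: 0.0-10.0s, chunk 1: 10.0-20.0s; the last chunk may be shorter
    print(f"chunk {chunk.sequence}: {chunk.start_time:.1f}-{chunk.end_time:.1f}s")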

98
core/chunker/collector.py Normal file

@@ -0,0 +1,98 @@
"""
ResultCollector — reassembles chunk results in sequence order using a min-heap.
Demonstrates:
- Algorithms and sorting (Interview Topic 6) — heapq for ordered reassembly
- Core data structures (Interview Topic 5) — heap, deque
"""
import heapq
from collections import deque
from typing import List
from .exceptions import ReassemblyError
from .models import ChunkResult
class ResultCollector:
"""
Receives ChunkResults out of order, emits them in sequence order.
Uses a min-heap keyed on sequence number. Only emits a chunk when
all prior sequences have been accounted for.
Args:
total_chunks: Expected total number of chunks
"""
def __init__(self, total_chunks: int):
self.total_chunks = total_chunks
self._heap: List[tuple[int, ChunkResult]] = []
self._next_sequence = 0
self._emitted: List[ChunkResult] = []
self._seen_sequences: set[int] = set()
# Sliding window for throughput calculation
self._recent_times: deque[float] = deque(maxlen=50)
def add(self, result: ChunkResult) -> List[ChunkResult]:
"""
Add a result and return any newly emittable results in order.
Args:
result: A ChunkResult (may arrive out of order)
Returns:
List of results that can now be emitted in sequence order
(may be empty if we're still waiting for earlier sequences)
Raises:
ReassemblyError: If a duplicate sequence is received
"""
if result.sequence in self._seen_sequences:
raise ReassemblyError(
f"Duplicate sequence number: {result.sequence}"
)
self._seen_sequences.add(result.sequence)
# Track processing time for throughput
if result.processing_time > 0:
self._recent_times.append(result.processing_time)
# Push to min-heap
heapq.heappush(self._heap, (result.sequence, result))
# Emit all consecutive results starting from _next_sequence
newly_emitted = []
while self._heap and self._heap[0][0] == self._next_sequence:
_, emitted_result = heapq.heappop(self._heap)
self._emitted.append(emitted_result)
newly_emitted.append(emitted_result)
self._next_sequence += 1
return newly_emitted
@property
def is_complete(self) -> bool:
"""True if all expected chunks have been emitted in order."""
return self._next_sequence == self.total_chunks
@property
def buffered_count(self) -> int:
"""Number of results waiting in the heap (arrived out of order)."""
return len(self._heap)
@property
def emitted_count(self) -> int:
"""Number of results emitted in sequence order."""
return len(self._emitted)
@property
def avg_processing_time(self) -> float:
"""Average processing time from recent results (sliding window)."""
if not self._recent_times:
return 0.0
return sum(self._recent_times) / len(self._recent_times)
def get_ordered_results(self) -> List[ChunkResult]:
"""Get all emitted results in sequence order."""
return list(self._emitted)
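
A small sketch of the buffering behavior, feeding hand-built results out of order:

from core.chunker import ChunkResult, ResultCollector

collector = ResultCollector(total_chunks=3)
print(collector.add(ChunkResult(sequence=2, success=True)))  # [] — buffered, waiting for 0 and 1
print(collector.add(ChunkResult(sequence=0, success=True)))  # [seq 0] — emitted immediately
print(collector.add(ChunkResult(sequence=1, success=True)))  # [seq 1, seq 2] — unblocks the buffered 2
print(collector.is_complete)  # True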

64
core/chunker/exceptions.py Normal file

@@ -0,0 +1,64 @@
"""
Chunker exception hierarchy.
Demonstrates: Managing exceptions and writing resilient code (Interview Topic 7).
"""
class PipelineError(Exception):
"""Base exception for all chunker pipeline errors."""
pass
class ChunkError(PipelineError):
"""Errors related to chunk creation or validation."""
pass
class ChunkReadError(ChunkError):
"""Failed to read chunk data from source file."""
pass
class ChunkChecksumError(ChunkError):
"""Chunk data integrity validation failed."""
def __init__(self, sequence: int, expected: str, actual: str):
self.sequence = sequence
self.expected = expected
self.actual = actual
super().__init__(
f"Chunk {sequence}: checksum mismatch "
f"(expected={expected}, actual={actual})"
)
class ProcessingError(PipelineError):
"""Errors during chunk processing by workers."""
pass
class ProcessorTimeoutError(ProcessingError):
"""Processor exceeded allowed time for a chunk."""
def __init__(self, sequence: int, timeout: float):
self.sequence = sequence
self.timeout = timeout
super().__init__(f"Chunk {sequence}: processor timed out after {timeout}s")
class ProcessorFailureError(ProcessingError):
"""Processor failed to process a chunk after all retries."""
def __init__(self, sequence: int, retries: int, original_error: Exception):
self.sequence = sequence
self.retries = retries
self.original_error = original_error
super().__init__(
f"Chunk {sequence}: failed after {retries} retries — {original_error}"
)
class ReassemblyError(PipelineError):
"""Errors during result collection and ordering."""
pass
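
Because everything derives from PipelineError, callers can catch narrowly or broadly; a sketch (sample.mp4 is a placeholder path):

from core.chunker import ChunkReadError, Pipeline, PipelineError

try:
    result = Pipeline("sample.mp4").run()  # placeholder path
except ChunkReadError as e:
    print(f"bad input file: {e}")   # most specific first
except PipelineError as e:
    print(f"pipeline failed: {e}")  # base class catches everything else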

54
core/chunker/models.py Normal file

@@ -0,0 +1,54 @@
"""
Internal data models for the chunker pipeline.
These are pipeline-internal dataclasses, not schema models.
Schema-level ChunkJob is in core/schema/models/jobs.py.
Demonstrates: Core data structures (Interview Topic 5).
"""
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
@dataclass
class Chunk:
"""A time-based segment of the source media file."""
sequence: int
start_time: float # seconds
end_time: float # seconds
source_path: str # path to source file
duration: float # end_time - start_time
checksum: str = "" # computed after extraction
@dataclass
class ChunkResult:
"""Result of processing a single chunk."""
sequence: int
success: bool
checksum_valid: bool = True
processing_time: float = 0.0
error: Optional[str] = None
retries: int = 0
worker_id: Optional[str] = None
output_file: Optional[str] = None
@dataclass
class PipelineResult:
"""Aggregate result of the entire pipeline run."""
total_chunks: int = 0
processed: int = 0
failed: int = 0
retries: int = 0
elapsed_time: float = 0.0
throughput_mbps: float = 0.0
worker_stats: Dict[str, Any] = field(default_factory=dict)
errors: List[str] = field(default_factory=list)
chunks_in_order: bool = True
output_dir: Optional[str] = None
chunk_files: List[str] = field(default_factory=list)

244
core/chunker/pipeline.py Normal file

@@ -0,0 +1,244 @@
"""
Pipeline — orchestrates the entire chunker pipeline.
Wires: Chunker → ChunkQueue → WorkerPool → ResultCollector → PipelineResult
Demonstrates:
- Function parameters and defaults (Interview Topic 1) — configurable pipeline
- Concurrency (Interview Topic 2) — producer thread + worker pool
- OOP design (Interview Topic 4) — composition of pipeline components
- Exception handling (Interview Topic 7) — graceful error propagation
"""
import json
import logging
import threading
import time
from pathlib import Path
from typing import Any, Callable, Dict, Optional
from .chunker import Chunker
from .collector import ResultCollector
from .exceptions import PipelineError
from .models import PipelineResult
from .pool import WorkerPool
from .queue import ChunkQueue
logger = logging.getLogger(__name__)
class Pipeline:
"""
Orchestrates the chunk processing pipeline.
The pipeline runs in three stages:
1. Producer thread: Chunker probes file → pushes time-based chunks to ChunkQueue
2. Worker pool: N workers pull from queue → extract mp4 segments → emit results
3. Collector: ResultCollector reassembles results in sequence order
Args:
source: Path to the source media file
chunk_duration: Duration of each chunk in seconds (default: 10.0)
num_workers: Number of concurrent worker threads (default: 4)
max_retries: Max retry attempts per chunk (default: 3)
processor_type: Processor to use — "ffmpeg", "checksum", "simulated_decode", "composite"
queue_size: Max chunks buffered in queue (default: 10)
event_callback: Optional callback for real-time events
output_dir: Directory for output chunk files (required for "ffmpeg" processor)
"""
def __init__(
self,
source: str,
chunk_duration: float = 10.0,
num_workers: int = 4,
max_retries: int = 3,
processor_type: str = "checksum",
queue_size: int = 10,
event_callback: Optional[Callable[[str, Dict[str, Any]], None]] = None,
output_dir: Optional[str] = None,
):
self.source = source
self.chunk_duration = chunk_duration
self.num_workers = num_workers
self.max_retries = max_retries
self.processor_type = processor_type
self.queue_size = queue_size
self.event_callback = event_callback
self.output_dir = output_dir
def _emit(self, event_type: str, data: Dict[str, Any]) -> None:
"""Emit an event if callback is registered."""
if self.event_callback:
self.event_callback(event_type, data)
def _produce_chunks(
self, chunker: Chunker, chunk_queue: ChunkQueue
) -> None:
"""Producer thread: probe file and enqueue time-based chunks."""
try:
for chunk in chunker.chunks():
chunk_queue.put(chunk, timeout=30.0)
self._emit("chunk_queued", {
"sequence": chunk.sequence,
"start_time": chunk.start_time,
"end_time": chunk.end_time,
"duration": chunk.duration,
"queue_size": chunk_queue.qsize(),
})
except Exception as e:
logger.error(f"Producer error: {e}")
self._emit("producer_error", {"error": str(e)})
finally:
chunk_queue.close()
def _write_manifest(
self, result: PipelineResult, source_duration: float
) -> None:
"""Write manifest.json to output_dir with segment metadata."""
if not self.output_dir:
return
manifest = {
"source": self.source,
"source_duration": source_duration,
"chunk_duration": self.chunk_duration,
"total_chunks": result.total_chunks,
"processed": result.processed,
"failed": result.failed,
"elapsed_time": result.elapsed_time,
"throughput_mbps": result.throughput_mbps,
"segments": [
{
"sequence": i,
"file": f"chunk_{i:04d}.mp4",
"start": i * self.chunk_duration,
"end": min(
(i + 1) * self.chunk_duration, source_duration
),
}
for i in range(result.total_chunks)
if i < result.total_chunks
],
}
manifest_path = Path(self.output_dir) / "manifest.json"
manifest_path.write_text(json.dumps(manifest, indent=2))
logger.info(f"Manifest written to {manifest_path}")
def run(self) -> PipelineResult:
"""
Execute the full pipeline.
Returns:
PipelineResult with aggregate stats
Raises:
PipelineError: If the pipeline fails catastrophically
"""
start_time = time.monotonic()
self._emit("pipeline_start", {
"source": self.source,
"chunk_duration": self.chunk_duration,
"num_workers": self.num_workers,
"processor_type": self.processor_type,
})
try:
# Stage 1: Set up chunker (probes file for duration)
chunker = Chunker(self.source, self.chunk_duration)
total_chunks = chunker.expected_chunks
if total_chunks == 0:
self._emit("pipeline_complete", {"total_chunks": 0})
return PipelineResult(chunks_in_order=True)
self._emit("pipeline_info", {
"file_size": chunker.file_size,
"source_duration": chunker.source_duration,
"total_chunks": total_chunks,
})
# Stage 2: Set up queue and worker pool
chunk_queue = ChunkQueue(maxsize=self.queue_size)
pool = WorkerPool(
num_workers=self.num_workers,
chunk_queue=chunk_queue,
processor_type=self.processor_type,
max_retries=self.max_retries,
event_callback=self.event_callback,
output_dir=self.output_dir,
)
# Stage 3: Start workers, then produce chunks
pool.start()
producer = threading.Thread(
target=self._produce_chunks,
args=(chunker, chunk_queue),
name="chunk-producer",
daemon=True,
)
producer.start()
# Stage 4: Wait for all workers to finish
all_results = pool.wait()
producer.join(timeout=5.0)
# Stage 5: Collect results in order
collector = ResultCollector(total_chunks)
for r in all_results:
collector.add(r)
self._emit("chunk_collected", {
"sequence": r.sequence,
"success": r.success,
"buffered": collector.buffered_count,
"emitted": collector.emitted_count,
})
# Build result
elapsed = time.monotonic() - start_time
file_size_mb = chunker.file_size / (1024 * 1024)
throughput = file_size_mb / elapsed if elapsed > 0 else 0.0
failed_results = [r for r in all_results if not r.success]
total_retries = sum(r.retries for r in all_results)
chunk_files = [
r.output_file for r in all_results
if r.success and r.output_file
]
result = PipelineResult(
total_chunks=total_chunks,
processed=len(all_results),
failed=len(failed_results),
retries=total_retries,
elapsed_time=elapsed,
throughput_mbps=throughput,
worker_stats=pool.get_worker_stats(),
errors=[r.error for r in failed_results if r.error],
chunks_in_order=collector.is_complete,
output_dir=self.output_dir,
chunk_files=chunk_files,
)
# Write manifest if output_dir is set
self._write_manifest(result, chunker.source_duration)
pool.shutdown()
self._emit("pipeline_complete", {
"total_chunks": result.total_chunks,
"processed": result.processed,
"failed": result.failed,
"elapsed": result.elapsed_time,
"throughput_mbps": result.throughput_mbps,
})
return result
except PipelineError:
raise
except Exception as e:
self._emit("pipeline_error", {"error": str(e)})
raise PipelineError(f"Pipeline failed: {e}") from e
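
A minimal sketch of observing pipeline progress through event_callback (event names taken from the _emit calls above; sample.mp4 is a placeholder path):

from core.chunker import Pipeline

def on_event(event_type, data):
    # e.g. pipeline_start, chunk_queued, chunk_done, chunk_collected, pipeline_complete
    print(f"[{event_type}] {data}")

result = Pipeline(
    source="sample.mp4",  # placeholder path
    processor_type="simulated_decode",
    event_callback=on_event,
).run()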

125
core/chunker/pool.py Normal file

@@ -0,0 +1,125 @@
"""
WorkerPool — manages N worker threads via ThreadPoolExecutor.
Demonstrates: Python concurrency — threading (Interview Topic 2).
"""
import logging
import threading
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Any, Callable, Dict, List, Optional
from .models import ChunkResult
from .processor import (
ChecksumProcessor,
CompositeProcessor,
FFmpegExtractProcessor,
Processor,
SimulatedDecodeProcessor,
)
from .queue import ChunkQueue
from .worker import Worker
logger = logging.getLogger(__name__)
def create_processor(
processor_type: str = "checksum",
output_dir: Optional[str] = None,
) -> Processor:
"""Factory for processor instances."""
if processor_type == "ffmpeg":
if not output_dir:
raise ValueError("output_dir required for ffmpeg processor")
return FFmpegExtractProcessor(output_dir=output_dir)
elif processor_type == "checksum":
return ChecksumProcessor()
elif processor_type == "simulated_decode":
return SimulatedDecodeProcessor()
elif processor_type == "composite":
return CompositeProcessor([
ChecksumProcessor(),
SimulatedDecodeProcessor(ms_per_second=50.0),
])
else:
raise ValueError(f"Unknown processor type: {processor_type}")
class WorkerPool:
"""
Manages N worker threads that process chunks concurrently.
Args:
num_workers: Number of concurrent worker threads (default: 4)
chunk_queue: Shared queue to pull chunks from
processor_type: Type of processor for each worker (default: "checksum")
max_retries: Max retry attempts per chunk (default: 3)
event_callback: Optional callback for real-time events
"""
def __init__(
self,
num_workers: int = 4,
chunk_queue: Optional[ChunkQueue] = None,
processor_type: str = "checksum",
max_retries: int = 3,
event_callback: Optional[Callable[[str, Dict[str, Any]], None]] = None,
output_dir: Optional[str] = None,
):
self.num_workers = num_workers
self.chunk_queue = chunk_queue or ChunkQueue()
self.processor_type = processor_type
self.max_retries = max_retries
self.event_callback = event_callback
self.output_dir = output_dir
self.shutdown_event = threading.Event()
self._executor: Optional[ThreadPoolExecutor] = None
self._futures: List[Future] = []
self._workers: List[Worker] = []
def start(self) -> None:
"""Start all worker threads."""
self._executor = ThreadPoolExecutor(
max_workers=self.num_workers,
thread_name_prefix="chunk-worker",
)
for i in range(self.num_workers):
worker = Worker(
worker_id=f"worker-{i}",
chunk_queue=self.chunk_queue,
processor=create_processor(self.processor_type, output_dir=self.output_dir),
max_retries=self.max_retries,
event_callback=self.event_callback,
)
self._workers.append(worker)
future = self._executor.submit(worker.run)
self._futures.append(future)
logger.info(f"WorkerPool started with {self.num_workers} workers")
def wait(self) -> List[ChunkResult]:
"""Wait for all workers to finish and collect results."""
all_results = []
for future in self._futures:
results = future.result()
all_results.extend(results)
return all_results
def shutdown(self) -> None:
"""Signal shutdown and cleanup."""
self.shutdown_event.set()
self.chunk_queue.close()
if self._executor:
self._executor.shutdown(wait=True)
def get_worker_stats(self) -> Dict[str, Any]:
"""Get per-worker statistics."""
return {
w.worker_id: {
"processed": w.processed_count,
"errors": w.error_count,
"retries": w.retry_count,
}
for w in self._workers
}
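
A small sketch of the factory in isolation (out/ is a placeholder directory; note that constructing the ffmpeg processor creates it):

from core.chunker.pool import create_processor

checksum = create_processor("checksum")                # metadata-only, no I/O
ffmpeg = create_processor("ffmpeg", output_dir="out")  # placeholder dir, will be created
try:
    create_processor("ffmpeg")  # missing output_dir
except ValueError as e:
    print(e)  # "output_dir required for ffmpeg processor"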

173
core/chunker/processor.py Normal file

@@ -0,0 +1,173 @@
"""
Processor ABC and concrete implementations.
Demonstrates: OOP design principles — ABC, inheritance, composition (Interview Topic 4).
"""
import hashlib
import time
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List
from .exceptions import ChunkChecksumError
from .models import Chunk, ChunkResult
class Processor(ABC):
"""
Abstract base class for chunk processors.
Each processor defines how a single chunk is processed.
The Worker calls processor.process(chunk) and handles retries.
"""
@abstractmethod
def process(self, chunk: Chunk) -> ChunkResult:
"""Process a single chunk and return the result."""
pass
class FFmpegExtractProcessor(Processor):
"""
Extracts a time segment from the source file using FFmpeg stream copy.
Produces a playable mp4 file per chunk — no re-encoding.
Args:
output_dir: Directory to write chunk mp4 files
"""
def __init__(self, output_dir: str):
self.output_dir = output_dir
Path(output_dir).mkdir(parents=True, exist_ok=True)
def process(self, chunk: Chunk) -> ChunkResult:
from core.ffmpeg.transcode import TranscodeConfig, transcode
start = time.monotonic()
output_file = str(
Path(self.output_dir) / f"chunk_{chunk.sequence:04d}.mp4"
)
config = TranscodeConfig(
input_path=chunk.source_path,
output_path=output_file,
video_codec="copy",
audio_codec="copy",
trim_start=chunk.start_time,
trim_end=chunk.end_time,
)
transcode(config)
# Compute checksum of output file
md5 = hashlib.md5()
with open(output_file, "rb") as f:
for block in iter(lambda: f.read(8192), b""):
md5.update(block)
checksum = md5.hexdigest()
elapsed = time.monotonic() - start
return ChunkResult(
sequence=chunk.sequence,
success=True,
checksum_valid=True,
processing_time=elapsed,
output_file=output_file,
)
class ChecksumProcessor(Processor):
"""
Validates chunk metadata consistency.
For time-based chunks, verifies the time range is valid.
Raises ChunkChecksumError on invalid ranges.
"""
def process(self, chunk: Chunk) -> ChunkResult:
start = time.monotonic()
valid = chunk.duration > 0 and chunk.end_time > chunk.start_time
if not valid:
raise ChunkChecksumError(
sequence=chunk.sequence,
expected="valid time range",
actual=f"{chunk.start_time}-{chunk.end_time}",
)
elapsed = time.monotonic() - start
return ChunkResult(
sequence=chunk.sequence,
success=True,
checksum_valid=True,
processing_time=elapsed,
)
class SimulatedDecodeProcessor(Processor):
"""
Simulates decode work by sleeping proportional to chunk duration.
Useful for demonstrating concurrency behavior without real FFmpeg.
Args:
ms_per_second: Milliseconds of simulated work per second of chunk duration (default: 100)
"""
def __init__(self, ms_per_second: float = 100.0):
self.ms_per_second = ms_per_second
def process(self, chunk: Chunk) -> ChunkResult:
start = time.monotonic()
sleep_time = (self.ms_per_second * chunk.duration) / 1000.0
time.sleep(sleep_time)
elapsed = time.monotonic() - start
return ChunkResult(
sequence=chunk.sequence,
success=True,
checksum_valid=True,
processing_time=elapsed,
)
class CompositeProcessor(Processor):
"""
Chains multiple processors — runs each in sequence on the same chunk.
Demonstrates OOP composition pattern.
Args:
processors: List of processors to chain
"""
def __init__(self, processors: List[Processor]):
if not processors:
raise ValueError("CompositeProcessor requires at least one processor")
self.processors = processors
def process(self, chunk: Chunk) -> ChunkResult:
start = time.monotonic()
last_result = None
for proc in self.processors:
last_result = proc.process(chunk)
if not last_result.success:
return last_result
elapsed = time.monotonic() - start
return ChunkResult(
sequence=chunk.sequence,
success=True,
checksum_valid=last_result.checksum_valid if last_result else True,
processing_time=elapsed,
)
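
Since Processor is an ABC, extending the pipeline is just a subclass; a hypothetical sketch (LoggingProcessor is not part of this commit):

import time

from core.chunker import Chunk, ChunkResult, Processor

class LoggingProcessor(Processor):
    """Hypothetical processor that just records the time range it saw."""

    def process(self, chunk: Chunk) -> ChunkResult:
        start = time.monotonic()
        print(f"chunk {chunk.sequence}: {chunk.start_time:.1f}-{chunk.end_time:.1f}s")
        return ChunkResult(
            sequence=chunk.sequence,
            success=True,
            processing_time=time.monotonic() - start,
        )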

76
core/chunker/queue.py Normal file

@@ -0,0 +1,76 @@
"""
ChunkQueue — bounded, thread-safe queue with sentinel-based shutdown.
Demonstrates: Core data structures — queue.Queue (Interview Topic 5).
"""
import queue
from typing import Optional
from .models import Chunk
# Sentinel value to signal workers to stop
_SENTINEL = object()
class ChunkQueue:
"""
Thread-safe bounded queue for chunks.
Provides backpressure: producers block when the queue is full,
preventing unbounded memory usage.
Args:
maxsize: Maximum number of chunks in the queue (default: 10)
"""
def __init__(self, maxsize: int = 10):
self._queue: queue.Queue = queue.Queue(maxsize=maxsize)
self._closed = False
self.maxsize = maxsize
def put(self, chunk: Chunk, timeout: Optional[float] = None) -> None:
"""
Add a chunk to the queue. Blocks if full (backpressure).
Args:
chunk: The chunk to enqueue
timeout: Max seconds to wait (None = block forever)
Raises:
queue.Full: If timeout expires while queue is full
"""
self._queue.put(chunk, timeout=timeout)
def get(self, timeout: Optional[float] = None) -> Optional[Chunk]:
"""
Get next chunk from queue. Returns None if queue is closed.
Args:
timeout: Max seconds to wait (None = block forever)
Returns:
Chunk or None (if sentinel received, meaning queue is closed)
Raises:
queue.Empty: If timeout expires while queue is empty
"""
item = self._queue.get(timeout=timeout)
if item is _SENTINEL:
# Re-put sentinel so other workers also see it
self._queue.put(_SENTINEL)
return None
return item
def close(self) -> None:
"""Signal all consumers to stop by inserting a sentinel."""
self._closed = True
self._queue.put(_SENTINEL)
@property
def is_closed(self) -> bool:
return self._closed
def qsize(self) -> int:
"""Current number of items in the queue (approximate)."""
return self._queue.qsize()
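
The sentinel re-put is what lets a single close() stop any number of consumers; a minimal single-threaded sketch (sample.mp4 is a placeholder path):

from core.chunker import Chunk, ChunkQueue

q = ChunkQueue(maxsize=2)
q.put(Chunk(sequence=0, start_time=0.0, end_time=10.0,
            source_path="sample.mp4", duration=10.0))  # placeholder path
q.close()

print(q.get())  # the chunk
print(q.get())  # None — sentinel reached, then re-queued for other consumers
print(q.get())  # None again, immediately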

141
core/chunker/worker.py Normal file

@@ -0,0 +1,141 @@
"""
Worker — pulls chunks from queue, processes with retry logic.
Demonstrates:
- Exception handling and resilient code (Interview Topic 7)
- Concurrency (Interview Topic 2) — workers run in thread pool
"""
import logging
import queue
import time
from typing import Any, Callable, Dict, Optional
from .exceptions import ProcessorFailureError
from .models import Chunk, ChunkResult
from .processor import Processor
from .queue import ChunkQueue
logger = logging.getLogger(__name__)
class Worker:
"""
Processes chunks from a queue with retry and exponential backoff.
Args:
worker_id: Identifier for this worker (e.g. "worker-0")
chunk_queue: Source queue to pull chunks from
processor: Processor instance to use
max_retries: Maximum retry attempts per chunk (default: 3)
event_callback: Optional callback for real-time status updates
"""
def __init__(
self,
worker_id: str,
chunk_queue: ChunkQueue,
processor: Processor,
max_retries: int = 3,
event_callback: Optional[Callable[[str, Dict[str, Any]], None]] = None,
):
self.worker_id = worker_id
self.chunk_queue = chunk_queue
self.processor = processor
self.max_retries = max_retries
self.event_callback = event_callback
self.processed_count = 0
self.error_count = 0
self.retry_count = 0
def _emit(self, event_type: str, data: Dict[str, Any]) -> None:
"""Emit an event if callback is registered."""
if self.event_callback:
self.event_callback(event_type, {"worker_id": self.worker_id, **data})
def _process_with_retry(self, chunk: Chunk) -> ChunkResult:
"""
Process a chunk with exponential backoff retry.
Retry delays: 0.1s, 0.2s, 0.4s, ... (doubles each attempt)
"""
last_error = None
for attempt in range(self.max_retries + 1):
try:
if attempt > 0:
backoff = 0.1 * (2 ** (attempt - 1))
self._emit("chunk_retry", {
"sequence": chunk.sequence,
"attempt": attempt,
"backoff": backoff,
})
time.sleep(backoff)
self.retry_count += 1
result = self.processor.process(chunk)
result.retries = attempt
result.worker_id = self.worker_id
return result
except Exception as e:
last_error = e
logger.warning(
f"{self.worker_id}: chunk {chunk.sequence} "
f"attempt {attempt + 1}/{self.max_retries + 1} failed: {e}"
)
# All retries exhausted
self.error_count += 1
self._emit("chunk_error", {
"sequence": chunk.sequence,
"error": str(last_error),
"retries": self.max_retries,
})
return ChunkResult(
sequence=chunk.sequence,
success=False,
processing_time=0.0,
error=str(last_error),
retries=self.max_retries,
worker_id=self.worker_id,
)
def run(self) -> list[ChunkResult]:
"""
Main worker loop — pull chunks and process until queue is closed.
Returns:
List of ChunkResults processed by this worker
"""
results = []
self._emit("worker_status", {"state": "idle"})
while True:
try:
chunk = self.chunk_queue.get(timeout=1.0)
except queue.Empty:
continue
if chunk is None: # Sentinel received
break
self._emit("chunk_processing", {
"sequence": chunk.sequence,
"state": "processing",
})
result = self._process_with_retry(chunk)
results.append(result)
self.processed_count += 1
self._emit("chunk_done", {
"sequence": chunk.sequence,
"success": result.success,
"processing_time": result.processing_time,
"retries": result.retries,
})
self._emit("worker_status", {"state": "stopped"})
return results
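
A standalone sketch of one worker draining a queue, single-threaded and without the pool (sample.mp4 is a placeholder path):

from core.chunker import Chunk, ChunkQueue, SimulatedDecodeProcessor
from core.chunker.worker import Worker

q = ChunkQueue(maxsize=4)
for i in range(3):
    q.put(Chunk(sequence=i, start_time=i * 10.0, end_time=(i + 1) * 10.0,
                source_path="sample.mp4", duration=10.0))  # placeholder path
q.close()

worker = Worker("worker-0", q, SimulatedDecodeProcessor(ms_per_second=10.0))
results = worker.run()  # runs until the sentinel is reached
print([(r.sequence, r.success) for r in results])  # [(0, True), (1, True), (2, True)]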