chunker and ui

2026-03-13 14:29:38 -03:00
parent 3eeedebb15
commit ccc478fbaa
69 changed files with 6481 additions and 282 deletions

64
core/chunker/__init__.py Normal file

@@ -0,0 +1,64 @@
"""
Chunker pipeline — splits files into chunks, processes concurrently, reassembles in order.
Public API:
Pipeline — orchestrates the full pipeline
PipelineResult — aggregate result dataclass
Chunker — file → Chunk generator
ChunkQueue — bounded thread-safe queue
WorkerPool — manages N worker threads
ResultCollector — heapq-based ordered reassembly
"""
from .chunker import Chunker
from .collector import ResultCollector
from .exceptions import (
ChunkChecksumError,
ChunkError,
ChunkReadError,
PipelineError,
ProcessingError,
ProcessorFailureError,
ProcessorTimeoutError,
ReassemblyError,
)
from .models import Chunk, ChunkResult, PipelineResult
from .pipeline import Pipeline
from .pool import WorkerPool
from .processor import (
ChecksumProcessor,
CompositeProcessor,
FFmpegExtractProcessor,
Processor,
SimulatedDecodeProcessor,
)
from .queue import ChunkQueue
__all__ = [
# Core
"Pipeline",
"PipelineResult",
# Components
"Chunker",
"ChunkQueue",
"WorkerPool",
"ResultCollector",
# Models
"Chunk",
"ChunkResult",
# Processors
"Processor",
"ChecksumProcessor",
"SimulatedDecodeProcessor",
"CompositeProcessor",
"FFmpegExtractProcessor",
# Exceptions
"PipelineError",
"ChunkError",
"ChunkReadError",
"ChunkChecksumError",
"ProcessingError",
"ProcessorFailureError",
"ProcessorTimeoutError",
"ReassemblyError",
]
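
For orientation, a minimal end-to-end usage sketch of this public API (the source path sample.mp4 is a placeholder, not part of this commit):

from core.chunker import Pipeline

pipeline = Pipeline(
    source="sample.mp4",        # placeholder path
    chunk_duration=10.0,
    num_workers=4,
    processor_type="checksum",  # metadata-only processor, no output files
)
result = pipeline.run()
print(f"{result.processed}/{result.total_chunks} chunks, "
      f"{result.failed} failed, in-order={result.chunks_in_order}")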

86
core/chunker/chunker.py Normal file

@@ -0,0 +1,86 @@
"""
Chunker — probes a media file and yields time-based Chunk objects.
Demonstrates:
- Function parameters and defaults (Interview Topic 1)
- List comprehensions and efficient iteration / generators (Interview Topic 3)
"""
import math
import os
from typing import Generator
from core.ffmpeg.probe import probe_file
from .exceptions import ChunkReadError
from .models import Chunk
class Chunker:
"""
Splits a media file into time-based chunks via a generator.
Uses FFmpeg probe to get duration, then yields Chunk objects
representing time segments (no data read — extraction happens in the processor).
Args:
file_path: Path to the source media file
chunk_duration: Duration of each chunk in seconds (default: 10.0)
"""
def __init__(self, file_path: str, chunk_duration: float = 10.0):
if not os.path.isfile(file_path):
raise ChunkReadError(f"File not found: {file_path}")
if chunk_duration <= 0:
raise ValueError("chunk_duration must be positive")
self.file_path = file_path
self.chunk_duration = chunk_duration
self.file_size = os.path.getsize(file_path)
self.source_duration = self._probe_duration()
def _probe_duration(self) -> float:
"""Get source file duration via FFmpeg probe."""
try:
result = probe_file(self.file_path)
if result.duration is None or result.duration <= 0:
raise ChunkReadError(
f"Cannot determine duration for {self.file_path}"
)
return result.duration
except ChunkReadError:
raise
except Exception as e:
raise ChunkReadError(
f"Failed to probe {self.file_path}: {e}"
) from e
@property
def expected_chunks(self) -> int:
"""Calculate expected number of chunks (last chunk may be shorter)."""
if self.source_duration <= 0:
return 0
return math.ceil(self.source_duration / self.chunk_duration)
def chunks(self) -> Generator[Chunk, None, None]:
"""
Yield Chunk objects representing time segments of the source file.
Generator-based: chunks are yielded on demand.
Each chunk defines a time range — actual extraction is done by the processor.
"""
total = self.expected_chunks
for sequence in range(total):
start_time = sequence * self.chunk_duration
end_time = min(
start_time + self.chunk_duration, self.source_duration
)
duration = end_time - start_time
yield Chunk(
sequence=sequence,
start_time=start_time,
end_time=end_time,
source_path=self.file_path,
duration=duration,
)
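
A quick sketch of the generator in isolation (sample.mp4 is a placeholder path):

from core.chunker import Chunker

chunker = Chunker("sample.mp4", chunk_duration=10.0)  # placeholder path
print(f"{chunker.source_duration:.1f}s -> {chunker.expected_chunks} chunks")
for chunk in chunker.chunks():
    # e.g. chunk 0: 0.0-10.0s, chunk 1: 10.0-20.0s; the last chunk may be shorter
    print(f"chunk {chunk.sequence}: {chunk.start_time:.1f}-{chunk.end_time:.1f}s")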

98
core/chunker/collector.py Normal file

@@ -0,0 +1,98 @@
"""
ResultCollector — reassembles chunk results in sequence order using a min-heap.
Demonstrates:
- Algorithms and sorting (Interview Topic 6) — heapq for ordered reassembly
- Core data structures (Interview Topic 5) — heap, deque
"""
import heapq
from collections import deque
from typing import List
from .exceptions import ReassemblyError
from .models import ChunkResult
class ResultCollector:
"""
Receives ChunkResults out of order, emits them in sequence order.
Uses a min-heap keyed on sequence number. Only emits a chunk when
all prior sequences have been accounted for.
Args:
total_chunks: Expected total number of chunks
"""
def __init__(self, total_chunks: int):
self.total_chunks = total_chunks
self._heap: List[tuple[int, ChunkResult]] = []
self._next_sequence = 0
self._emitted: List[ChunkResult] = []
self._seen_sequences: set[int] = set()
# Sliding window for throughput calculation
self._recent_times: deque[float] = deque(maxlen=50)
def add(self, result: ChunkResult) -> List[ChunkResult]:
"""
Add a result and return any newly emittable results in order.
Args:
result: A ChunkResult (may arrive out of order)
Returns:
List of results that can now be emitted in sequence order
(may be empty if we're still waiting for earlier sequences)
Raises:
ReassemblyError: If a duplicate sequence is received
"""
if result.sequence in self._seen_sequences:
raise ReassemblyError(
f"Duplicate sequence number: {result.sequence}"
)
self._seen_sequences.add(result.sequence)
# Track processing time for throughput
if result.processing_time > 0:
self._recent_times.append(result.processing_time)
# Push to min-heap
heapq.heappush(self._heap, (result.sequence, result))
# Emit all consecutive results starting from _next_sequence
newly_emitted = []
while self._heap and self._heap[0][0] == self._next_sequence:
_, emitted_result = heapq.heappop(self._heap)
self._emitted.append(emitted_result)
newly_emitted.append(emitted_result)
self._next_sequence += 1
return newly_emitted
@property
def is_complete(self) -> bool:
"""True if all expected chunks have been emitted in order."""
return self._next_sequence == self.total_chunks
@property
def buffered_count(self) -> int:
"""Number of results waiting in the heap (arrived out of order)."""
return len(self._heap)
@property
def emitted_count(self) -> int:
"""Number of results emitted in sequence order."""
return len(self._emitted)
@property
def avg_processing_time(self) -> float:
"""Average processing time from recent results (sliding window)."""
if not self._recent_times:
return 0.0
return sum(self._recent_times) / len(self._recent_times)
def get_ordered_results(self) -> List[ChunkResult]:
"""Get all emitted results in sequence order."""
return list(self._emitted)
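
A small sketch of the buffering behavior, feeding hand-built results out of order:

from core.chunker import ChunkResult, ResultCollector

collector = ResultCollector(total_chunks=3)
print(collector.add(ChunkResult(sequence=2, success=True)))  # [] — buffered, waiting for 0 and 1
print(collector.add(ChunkResult(sequence=0, success=True)))  # [seq 0] — emitted immediately
print(collector.add(ChunkResult(sequence=1, success=True)))  # [seq 1, seq 2] — unblocks the buffered 2
print(collector.is_complete)  # True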

64
core/chunker/exceptions.py Normal file

@@ -0,0 +1,64 @@
"""
Chunker exception hierarchy.
Demonstrates: Managing exceptions and writing resilient code (Interview Topic 7).
"""
class PipelineError(Exception):
"""Base exception for all chunker pipeline errors."""
pass
class ChunkError(PipelineError):
"""Errors related to chunk creation or validation."""
pass
class ChunkReadError(ChunkError):
"""Failed to read chunk data from source file."""
pass
class ChunkChecksumError(ChunkError):
"""Chunk data integrity validation failed."""
def __init__(self, sequence: int, expected: str, actual: str):
self.sequence = sequence
self.expected = expected
self.actual = actual
super().__init__(
f"Chunk {sequence}: checksum mismatch "
f"(expected={expected}, actual={actual})"
)
class ProcessingError(PipelineError):
"""Errors during chunk processing by workers."""
pass
class ProcessorTimeoutError(ProcessingError):
"""Processor exceeded allowed time for a chunk."""
def __init__(self, sequence: int, timeout: float):
self.sequence = sequence
self.timeout = timeout
super().__init__(f"Chunk {sequence}: processor timed out after {timeout}s")
class ProcessorFailureError(ProcessingError):
"""Processor failed to process a chunk after all retries."""
def __init__(self, sequence: int, retries: int, original_error: Exception):
self.sequence = sequence
self.retries = retries
self.original_error = original_error
super().__init__(
f"Chunk {sequence}: failed after {retries} retries — {original_error}"
)
class ReassemblyError(PipelineError):
"""Errors during result collection and ordering."""
pass
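
Because everything derives from PipelineError, callers can catch narrowly or broadly; a sketch (sample.mp4 is a placeholder path):

from core.chunker import ChunkReadError, Pipeline, PipelineError

try:
    result = Pipeline("sample.mp4").run()  # placeholder path
except ChunkReadError as e:
    print(f"bad input file: {e}")   # most specific first
except PipelineError as e:
    print(f"pipeline failed: {e}")  # base class catches everything else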

54
core/chunker/models.py Normal file

@@ -0,0 +1,54 @@
"""
Internal data models for the chunker pipeline.
These are pipeline-internal dataclasses, not schema models.
Schema-level ChunkJob is in core/schema/models/jobs.py.
Demonstrates: Core data structures (Interview Topic 5).
"""
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
@dataclass
class Chunk:
"""A time-based segment of the source media file."""
sequence: int
start_time: float # seconds
end_time: float # seconds
source_path: str # path to source file
duration: float # end_time - start_time
checksum: str = "" # computed after extraction
@dataclass
class ChunkResult:
"""Result of processing a single chunk."""
sequence: int
success: bool
checksum_valid: bool = True
processing_time: float = 0.0
error: Optional[str] = None
retries: int = 0
worker_id: Optional[str] = None
output_file: Optional[str] = None
@dataclass
class PipelineResult:
"""Aggregate result of the entire pipeline run."""
total_chunks: int = 0
processed: int = 0
failed: int = 0
retries: int = 0
elapsed_time: float = 0.0
throughput_mbps: float = 0.0
worker_stats: Dict[str, Any] = field(default_factory=dict)
errors: List[str] = field(default_factory=list)
chunks_in_order: bool = True
output_dir: Optional[str] = None
chunk_files: List[str] = field(default_factory=list)

244
core/chunker/pipeline.py Normal file

@@ -0,0 +1,244 @@
"""
Pipeline — orchestrates the entire chunker pipeline.
Wires: Chunker → ChunkQueue → WorkerPool → ResultCollector → PipelineResult
Demonstrates:
- Function parameters and defaults (Interview Topic 1) — configurable pipeline
- Concurrency (Interview Topic 2) — producer thread + worker pool
- OOP design (Interview Topic 4) — composition of pipeline components
- Exception handling (Interview Topic 7) — graceful error propagation
"""
import json
import logging
import threading
import time
from pathlib import Path
from typing import Any, Callable, Dict, Optional
from .chunker import Chunker
from .collector import ResultCollector
from .exceptions import PipelineError
from .models import PipelineResult
from .pool import WorkerPool
from .queue import ChunkQueue
logger = logging.getLogger(__name__)
class Pipeline:
"""
Orchestrates the chunk processing pipeline.
The pipeline runs in three stages:
1. Producer thread: Chunker probes file → pushes time-based chunks to ChunkQueue
2. Worker pool: N workers pull from queue → extract mp4 segments → emit results
3. Collector: ResultCollector reassembles results in sequence order
Args:
source: Path to the source media file
chunk_duration: Duration of each chunk in seconds (default: 10.0)
num_workers: Number of concurrent worker threads (default: 4)
max_retries: Max retry attempts per chunk (default: 3)
processor_type: Processor to use — "ffmpeg", "checksum", "simulated_decode", "composite"
queue_size: Max chunks buffered in queue (default: 10)
event_callback: Optional callback for real-time events
output_dir: Directory for output chunk files (required for "ffmpeg" processor)
"""
def __init__(
self,
source: str,
chunk_duration: float = 10.0,
num_workers: int = 4,
max_retries: int = 3,
processor_type: str = "checksum",
queue_size: int = 10,
event_callback: Optional[Callable[[str, Dict[str, Any]], None]] = None,
output_dir: Optional[str] = None,
):
self.source = source
self.chunk_duration = chunk_duration
self.num_workers = num_workers
self.max_retries = max_retries
self.processor_type = processor_type
self.queue_size = queue_size
self.event_callback = event_callback
self.output_dir = output_dir
def _emit(self, event_type: str, data: Dict[str, Any]) -> None:
"""Emit an event if callback is registered."""
if self.event_callback:
self.event_callback(event_type, data)
def _produce_chunks(
self, chunker: Chunker, chunk_queue: ChunkQueue
) -> None:
"""Producer thread: probe file and enqueue time-based chunks."""
try:
for chunk in chunker.chunks():
chunk_queue.put(chunk, timeout=30.0)
self._emit("chunk_queued", {
"sequence": chunk.sequence,
"start_time": chunk.start_time,
"end_time": chunk.end_time,
"duration": chunk.duration,
"queue_size": chunk_queue.qsize(),
})
except Exception as e:
logger.error(f"Producer error: {e}")
self._emit("producer_error", {"error": str(e)})
finally:
chunk_queue.close()
def _write_manifest(
self, result: PipelineResult, source_duration: float
) -> None:
"""Write manifest.json to output_dir with segment metadata."""
if not self.output_dir:
return
manifest = {
"source": self.source,
"source_duration": source_duration,
"chunk_duration": self.chunk_duration,
"total_chunks": result.total_chunks,
"processed": result.processed,
"failed": result.failed,
"elapsed_time": result.elapsed_time,
"throughput_mbps": result.throughput_mbps,
"segments": [
{
"sequence": i,
"file": f"chunk_{i:04d}.mp4",
"start": i * self.chunk_duration,
"end": min(
(i + 1) * self.chunk_duration, source_duration
),
}
for i in range(result.total_chunks)
if i < result.total_chunks
],
}
manifest_path = Path(self.output_dir) / "manifest.json"
manifest_path.write_text(json.dumps(manifest, indent=2))
logger.info(f"Manifest written to {manifest_path}")
def run(self) -> PipelineResult:
"""
Execute the full pipeline.
Returns:
PipelineResult with aggregate stats
Raises:
PipelineError: If the pipeline fails catastrophically
"""
start_time = time.monotonic()
self._emit("pipeline_start", {
"source": self.source,
"chunk_duration": self.chunk_duration,
"num_workers": self.num_workers,
"processor_type": self.processor_type,
})
try:
# Stage 1: Set up chunker (probes file for duration)
chunker = Chunker(self.source, self.chunk_duration)
total_chunks = chunker.expected_chunks
if total_chunks == 0:
self._emit("pipeline_complete", {"total_chunks": 0})
return PipelineResult(chunks_in_order=True)
self._emit("pipeline_info", {
"file_size": chunker.file_size,
"source_duration": chunker.source_duration,
"total_chunks": total_chunks,
})
# Stage 2: Set up queue and worker pool
chunk_queue = ChunkQueue(maxsize=self.queue_size)
pool = WorkerPool(
num_workers=self.num_workers,
chunk_queue=chunk_queue,
processor_type=self.processor_type,
max_retries=self.max_retries,
event_callback=self.event_callback,
output_dir=self.output_dir,
)
# Stage 3: Start workers, then produce chunks
pool.start()
producer = threading.Thread(
target=self._produce_chunks,
args=(chunker, chunk_queue),
name="chunk-producer",
daemon=True,
)
producer.start()
# Stage 4: Wait for all workers to finish
all_results = pool.wait()
producer.join(timeout=5.0)
# Stage 5: Collect results in order
collector = ResultCollector(total_chunks)
for r in all_results:
collector.add(r)
self._emit("chunk_collected", {
"sequence": r.sequence,
"success": r.success,
"buffered": collector.buffered_count,
"emitted": collector.emitted_count,
})
# Build result
elapsed = time.monotonic() - start_time
file_size_mb = chunker.file_size / (1024 * 1024)
throughput = file_size_mb / elapsed if elapsed > 0 else 0.0
failed_results = [r for r in all_results if not r.success]
total_retries = sum(r.retries for r in all_results)
chunk_files = [
r.output_file for r in all_results
if r.success and r.output_file
]
result = PipelineResult(
total_chunks=total_chunks,
processed=len(all_results),
failed=len(failed_results),
retries=total_retries,
elapsed_time=elapsed,
throughput_mbps=throughput,
worker_stats=pool.get_worker_stats(),
errors=[r.error for r in failed_results if r.error],
chunks_in_order=collector.is_complete,
output_dir=self.output_dir,
chunk_files=chunk_files,
)
# Write manifest if output_dir is set
self._write_manifest(result, chunker.source_duration)
pool.shutdown()
self._emit("pipeline_complete", {
"total_chunks": result.total_chunks,
"processed": result.processed,
"failed": result.failed,
"elapsed": result.elapsed_time,
"throughput_mbps": result.throughput_mbps,
})
return result
except PipelineError:
raise
except Exception as e:
self._emit("pipeline_error", {"error": str(e)})
raise PipelineError(f"Pipeline failed: {e}") from e
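
A minimal sketch of observing pipeline progress through event_callback (event names taken from the _emit calls above; sample.mp4 is a placeholder path):

from core.chunker import Pipeline

def on_event(event_type, data):
    # e.g. pipeline_start, chunk_queued, chunk_done, chunk_collected, pipeline_complete
    print(f"[{event_type}] {data}")

result = Pipeline(
    source="sample.mp4",  # placeholder path
    processor_type="simulated_decode",
    event_callback=on_event,
).run()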

125
core/chunker/pool.py Normal file

@@ -0,0 +1,125 @@
"""
WorkerPool — manages N worker threads via ThreadPoolExecutor.
Demonstrates: Python concurrency — threading (Interview Topic 2).
"""
import logging
import threading
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Any, Callable, Dict, List, Optional
from .models import ChunkResult
from .processor import (
ChecksumProcessor,
CompositeProcessor,
FFmpegExtractProcessor,
Processor,
SimulatedDecodeProcessor,
)
from .queue import ChunkQueue
from .worker import Worker
logger = logging.getLogger(__name__)
def create_processor(
processor_type: str = "checksum",
output_dir: Optional[str] = None,
) -> Processor:
"""Factory for processor instances."""
if processor_type == "ffmpeg":
if not output_dir:
raise ValueError("output_dir required for ffmpeg processor")
return FFmpegExtractProcessor(output_dir=output_dir)
elif processor_type == "checksum":
return ChecksumProcessor()
elif processor_type == "simulated_decode":
return SimulatedDecodeProcessor()
elif processor_type == "composite":
return CompositeProcessor([
ChecksumProcessor(),
SimulatedDecodeProcessor(ms_per_second=50.0),
])
else:
raise ValueError(f"Unknown processor type: {processor_type}")
class WorkerPool:
"""
Manages N worker threads that process chunks concurrently.
Args:
num_workers: Number of concurrent worker threads (default: 4)
chunk_queue: Shared queue to pull chunks from
processor_type: Type of processor for each worker (default: "checksum")
max_retries: Max retry attempts per chunk (default: 3)
event_callback: Optional callback for real-time events
"""
def __init__(
self,
num_workers: int = 4,
chunk_queue: Optional[ChunkQueue] = None,
processor_type: str = "checksum",
max_retries: int = 3,
event_callback: Optional[Callable[[str, Dict[str, Any]], None]] = None,
output_dir: Optional[str] = None,
):
self.num_workers = num_workers
self.chunk_queue = chunk_queue or ChunkQueue()
self.processor_type = processor_type
self.max_retries = max_retries
self.event_callback = event_callback
self.output_dir = output_dir
self.shutdown_event = threading.Event()
self._executor: Optional[ThreadPoolExecutor] = None
self._futures: List[Future] = []
self._workers: List[Worker] = []
def start(self) -> None:
"""Start all worker threads."""
self._executor = ThreadPoolExecutor(
max_workers=self.num_workers,
thread_name_prefix="chunk-worker",
)
for i in range(self.num_workers):
worker = Worker(
worker_id=f"worker-{i}",
chunk_queue=self.chunk_queue,
processor=create_processor(self.processor_type, output_dir=self.output_dir),
max_retries=self.max_retries,
event_callback=self.event_callback,
)
self._workers.append(worker)
future = self._executor.submit(worker.run)
self._futures.append(future)
logger.info(f"WorkerPool started with {self.num_workers} workers")
def wait(self) -> List[ChunkResult]:
"""Wait for all workers to finish and collect results."""
all_results = []
for future in self._futures:
results = future.result()
all_results.extend(results)
return all_results
def shutdown(self) -> None:
"""Signal shutdown and cleanup."""
self.shutdown_event.set()
self.chunk_queue.close()
if self._executor:
self._executor.shutdown(wait=True)
def get_worker_stats(self) -> Dict[str, Any]:
"""Get per-worker statistics."""
return {
w.worker_id: {
"processed": w.processed_count,
"errors": w.error_count,
"retries": w.retry_count,
}
for w in self._workers
}
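
A small sketch of the factory in isolation (out/ is a placeholder directory; note that constructing the ffmpeg processor creates it):

from core.chunker.pool import create_processor

checksum = create_processor("checksum")                # metadata-only, no I/O
ffmpeg = create_processor("ffmpeg", output_dir="out")  # placeholder dir, will be created
try:
    create_processor("ffmpeg")  # missing output_dir
except ValueError as e:
    print(e)  # "output_dir required for ffmpeg processor"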

173
core/chunker/processor.py Normal file

@@ -0,0 +1,173 @@
"""
Processor ABC and concrete implementations.
Demonstrates: OOP design principles — ABC, inheritance, composition (Interview Topic 4).
"""
import hashlib
import time
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List
from .exceptions import ChunkChecksumError
from .models import Chunk, ChunkResult
class Processor(ABC):
"""
Abstract base class for chunk processors.
Each processor defines how a single chunk is processed.
The Worker calls processor.process(chunk) and handles retries.
"""
@abstractmethod
def process(self, chunk: Chunk) -> ChunkResult:
"""Process a single chunk and return the result."""
pass
class FFmpegExtractProcessor(Processor):
"""
Extracts a time segment from the source file using FFmpeg stream copy.
Produces a playable mp4 file per chunk — no re-encoding.
Args:
output_dir: Directory to write chunk mp4 files
"""
def __init__(self, output_dir: str):
self.output_dir = output_dir
Path(output_dir).mkdir(parents=True, exist_ok=True)
def process(self, chunk: Chunk) -> ChunkResult:
from core.ffmpeg.transcode import TranscodeConfig, transcode
start = time.monotonic()
output_file = str(
Path(self.output_dir) / f"chunk_{chunk.sequence:04d}.mp4"
)
config = TranscodeConfig(
input_path=chunk.source_path,
output_path=output_file,
video_codec="copy",
audio_codec="copy",
trim_start=chunk.start_time,
trim_end=chunk.end_time,
)
transcode(config)
# Compute checksum of output file
md5 = hashlib.md5()
with open(output_file, "rb") as f:
for block in iter(lambda: f.read(8192), b""):
md5.update(block)
checksum = md5.hexdigest()
elapsed = time.monotonic() - start
return ChunkResult(
sequence=chunk.sequence,
success=True,
checksum_valid=True,
processing_time=elapsed,
output_file=output_file,
)
class ChecksumProcessor(Processor):
"""
Validates chunk metadata consistency.
For time-based chunks, verifies the time range is valid.
Raises ChunkChecksumError on invalid ranges.
"""
def process(self, chunk: Chunk) -> ChunkResult:
start = time.monotonic()
valid = chunk.duration > 0 and chunk.end_time > chunk.start_time
if not valid:
raise ChunkChecksumError(
sequence=chunk.sequence,
expected="valid time range",
actual=f"{chunk.start_time}-{chunk.end_time}",
)
elapsed = time.monotonic() - start
return ChunkResult(
sequence=chunk.sequence,
success=True,
checksum_valid=True,
processing_time=elapsed,
)
class SimulatedDecodeProcessor(Processor):
"""
Simulates decode work by sleeping proportional to chunk duration.
Useful for demonstrating concurrency behavior without real FFmpeg.
Args:
ms_per_second: Milliseconds of simulated work per second of chunk duration (default: 100)
"""
def __init__(self, ms_per_second: float = 100.0):
self.ms_per_second = ms_per_second
def process(self, chunk: Chunk) -> ChunkResult:
start = time.monotonic()
sleep_time = (self.ms_per_second * chunk.duration) / 1000.0
time.sleep(sleep_time)
elapsed = time.monotonic() - start
return ChunkResult(
sequence=chunk.sequence,
success=True,
checksum_valid=True,
processing_time=elapsed,
)
class CompositeProcessor(Processor):
"""
Chains multiple processors — runs each in sequence on the same chunk.
Demonstrates OOP composition pattern.
Args:
processors: List of processors to chain
"""
def __init__(self, processors: List[Processor]):
if not processors:
raise ValueError("CompositeProcessor requires at least one processor")
self.processors = processors
def process(self, chunk: Chunk) -> ChunkResult:
start = time.monotonic()
last_result = None
for proc in self.processors:
last_result = proc.process(chunk)
if not last_result.success:
return last_result
elapsed = time.monotonic() - start
return ChunkResult(
sequence=chunk.sequence,
success=True,
checksum_valid=last_result.checksum_valid if last_result else True,
processing_time=elapsed,
)
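
Since Processor is an ABC, extending the pipeline is just a subclass; a hypothetical sketch (LoggingProcessor is not part of this commit):

import time

from core.chunker import Chunk, ChunkResult, Processor

class LoggingProcessor(Processor):
    """Hypothetical processor that just records the time range it saw."""

    def process(self, chunk: Chunk) -> ChunkResult:
        start = time.monotonic()
        print(f"chunk {chunk.sequence}: {chunk.start_time:.1f}-{chunk.end_time:.1f}s")
        return ChunkResult(
            sequence=chunk.sequence,
            success=True,
            processing_time=time.monotonic() - start,
        )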

76
core/chunker/queue.py Normal file

@@ -0,0 +1,76 @@
"""
ChunkQueue — bounded, thread-safe queue with sentinel-based shutdown.
Demonstrates: Core data structures — queue.Queue (Interview Topic 5).
"""
import queue
from typing import Optional
from .models import Chunk
# Sentinel value to signal workers to stop
_SENTINEL = object()
class ChunkQueue:
"""
Thread-safe bounded queue for chunks.
Provides backpressure: producers block when the queue is full,
preventing unbounded memory usage.
Args:
maxsize: Maximum number of chunks in the queue (default: 10)
"""
def __init__(self, maxsize: int = 10):
self._queue: queue.Queue = queue.Queue(maxsize=maxsize)
self._closed = False
self.maxsize = maxsize
def put(self, chunk: Chunk, timeout: Optional[float] = None) -> None:
"""
Add a chunk to the queue. Blocks if full (backpressure).
Args:
chunk: The chunk to enqueue
timeout: Max seconds to wait (None = block forever)
Raises:
queue.Full: If timeout expires while queue is full
"""
self._queue.put(chunk, timeout=timeout)
def get(self, timeout: Optional[float] = None) -> Optional[Chunk]:
"""
Get next chunk from queue. Returns None if queue is closed.
Args:
timeout: Max seconds to wait (None = block forever)
Returns:
Chunk or None (if sentinel received, meaning queue is closed)
Raises:
queue.Empty: If timeout expires while queue is empty
"""
item = self._queue.get(timeout=timeout)
if item is _SENTINEL:
# Re-put sentinel so other workers also see it
self._queue.put(_SENTINEL)
return None
return item
def close(self) -> None:
"""Signal all consumers to stop by inserting a sentinel."""
self._closed = True
self._queue.put(_SENTINEL)
@property
def is_closed(self) -> bool:
return self._closed
def qsize(self) -> int:
"""Current number of items in the queue (approximate)."""
return self._queue.qsize()
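
The sentinel re-put is what lets a single close() stop any number of consumers; a minimal single-threaded sketch (sample.mp4 is a placeholder path):

from core.chunker import Chunk, ChunkQueue

q = ChunkQueue(maxsize=2)
q.put(Chunk(sequence=0, start_time=0.0, end_time=10.0,
            source_path="sample.mp4", duration=10.0))  # placeholder path
q.close()

print(q.get())  # the chunk
print(q.get())  # None — sentinel reached, then re-queued for other consumers
print(q.get())  # None again, immediately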

141
core/chunker/worker.py Normal file

@@ -0,0 +1,141 @@
"""
Worker — pulls chunks from queue, processes with retry logic.
Demonstrates:
- Exception handling and resilient code (Interview Topic 7)
- Concurrency (Interview Topic 2) — workers run in thread pool
"""
import logging
import queue
import time
from typing import Any, Callable, Dict, Optional
from .exceptions import ProcessorFailureError
from .models import Chunk, ChunkResult
from .processor import Processor
from .queue import ChunkQueue
logger = logging.getLogger(__name__)
class Worker:
"""
Processes chunks from a queue with retry and exponential backoff.
Args:
worker_id: Identifier for this worker (e.g. "worker-0")
chunk_queue: Source queue to pull chunks from
processor: Processor instance to use
max_retries: Maximum retry attempts per chunk (default: 3)
event_callback: Optional callback for real-time status updates
"""
def __init__(
self,
worker_id: str,
chunk_queue: ChunkQueue,
processor: Processor,
max_retries: int = 3,
event_callback: Optional[Callable[[str, Dict[str, Any]], None]] = None,
):
self.worker_id = worker_id
self.chunk_queue = chunk_queue
self.processor = processor
self.max_retries = max_retries
self.event_callback = event_callback
self.processed_count = 0
self.error_count = 0
self.retry_count = 0
def _emit(self, event_type: str, data: Dict[str, Any]) -> None:
"""Emit an event if callback is registered."""
if self.event_callback:
self.event_callback(event_type, {"worker_id": self.worker_id, **data})
def _process_with_retry(self, chunk: Chunk) -> ChunkResult:
"""
Process a chunk with exponential backoff retry.
Retry delays: 0.1s, 0.2s, 0.4s, ... (doubles each attempt)
"""
last_error = None
for attempt in range(self.max_retries + 1):
try:
if attempt > 0:
backoff = 0.1 * (2 ** (attempt - 1))
self._emit("chunk_retry", {
"sequence": chunk.sequence,
"attempt": attempt,
"backoff": backoff,
})
time.sleep(backoff)
self.retry_count += 1
result = self.processor.process(chunk)
result.retries = attempt
result.worker_id = self.worker_id
return result
except Exception as e:
last_error = e
logger.warning(
f"{self.worker_id}: chunk {chunk.sequence} "
f"attempt {attempt + 1}/{self.max_retries + 1} failed: {e}"
)
# All retries exhausted
self.error_count += 1
self._emit("chunk_error", {
"sequence": chunk.sequence,
"error": str(last_error),
"retries": self.max_retries,
})
return ChunkResult(
sequence=chunk.sequence,
success=False,
processing_time=0.0,
error=str(last_error),
retries=self.max_retries,
worker_id=self.worker_id,
)
def run(self) -> list[ChunkResult]:
"""
Main worker loop — pull chunks and process until queue is closed.
Returns:
List of ChunkResults processed by this worker
"""
results = []
self._emit("worker_status", {"state": "idle"})
while True:
try:
chunk = self.chunk_queue.get(timeout=1.0)
except queue.Empty:
continue
if chunk is None: # Sentinel received
break
self._emit("chunk_processing", {
"sequence": chunk.sequence,
"state": "processing",
})
result = self._process_with_retry(chunk)
results.append(result)
self.processed_count += 1
self._emit("chunk_done", {
"sequence": chunk.sequence,
"success": result.success,
"processing_time": result.processing_time,
"retries": result.retries,
})
self._emit("worker_status", {"state": "stopped"})
return results
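
A standalone sketch of one worker draining a queue, single-threaded and without the pool (sample.mp4 is a placeholder path):

from core.chunker import Chunk, ChunkQueue, SimulatedDecodeProcessor
from core.chunker.worker import Worker

q = ChunkQueue(maxsize=4)
for i in range(3):
    q.put(Chunk(sequence=i, start_time=i * 10.0, end_time=(i + 1) * 10.0,
                source_path="sample.mp4", duration=10.0))  # placeholder path
q.close()

worker = Worker("worker-0", q, SimulatedDecodeProcessor(ms_per_second=10.0))
results = worker.run()  # runs until the sentinel is reached
print([(r.sequence, r.success) for r in results])  # [(0, True), (1, True), (2, True)]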