""" Chunker — probes a media file and yields time-based Chunk objects. Demonstrates: - Function parameters and defaults (Interview Topic 1) - List comprehensions and efficient iteration / generators (Interview Topic 3) """ import math import os from typing import Generator from core.ffmpeg.probe import probe_file from .exceptions import ChunkReadError from .models import Chunk class Chunker: """ Splits a media file into time-based chunks via a generator. Uses FFmpeg probe to get duration, then yields Chunk objects representing time segments (no data read — extraction happens in the processor). Args: file_path: Path to the source media file chunk_duration: Duration of each chunk in seconds (default: 10.0) """ def __init__( self, file_path: str, chunk_duration: float = 10.0, start_time: float | None = None, end_time: float | None = None, ): if not os.path.isfile(file_path): raise ChunkReadError(f"File not found: {file_path}") if chunk_duration <= 0: raise ValueError("chunk_duration must be positive") self.file_path = file_path self.chunk_duration = chunk_duration self.file_size = os.path.getsize(file_path) full_duration = self._probe_duration() # Apply time range self.range_start = max(start_time or 0.0, 0.0) self.range_end = min(end_time or full_duration, full_duration) if self.range_start >= self.range_end: raise ValueError( f"Invalid range: start={self.range_start} >= end={self.range_end}" ) self.source_duration = self.range_end - self.range_start def _probe_duration(self) -> float: """Get source file duration via FFmpeg probe.""" try: result = probe_file(self.file_path) if result.duration is None or result.duration <= 0: raise ChunkReadError( f"Cannot determine duration for {self.file_path}" ) return result.duration except ChunkReadError: raise except Exception as e: raise ChunkReadError( f"Failed to probe {self.file_path}: {e}" ) from e @property def expected_chunks(self) -> int: """Calculate expected number of chunks (last chunk may be shorter).""" if self.source_duration <= 0: return 0 return math.ceil(self.source_duration / self.chunk_duration) def chunks(self) -> Generator[Chunk, None, None]: """ Yield Chunk objects representing time segments of the source file. Generator-based: chunks are yielded on demand. Each chunk defines a time range — actual extraction is done by the processor. """ total = self.expected_chunks for sequence in range(total): start_time = self.range_start + sequence * self.chunk_duration end_time = min( start_time + self.chunk_duration, self.range_end ) duration = end_time - start_time yield Chunk( sequence=sequence, start_time=start_time, end_time=end_time, source_path=self.file_path, duration=duration, )