""" Chunker — probes a media file and yields time-based Chunk objects. Demonstrates: - Function parameters and defaults (Interview Topic 1) - List comprehensions and efficient iteration / generators (Interview Topic 3) """ import math import os from typing import Generator from core.ffmpeg.probe import probe_file from .exceptions import ChunkReadError from .models import Chunk class Chunker: """ Splits a media file into time-based chunks via a generator. Uses FFmpeg probe to get duration, then yields Chunk objects representing time segments (no data read — extraction happens in the processor). Args: file_path: Path to the source media file chunk_duration: Duration of each chunk in seconds (default: 10.0) """ def __init__(self, file_path: str, chunk_duration: float = 10.0): if not os.path.isfile(file_path): raise ChunkReadError(f"File not found: {file_path}") if chunk_duration <= 0: raise ValueError("chunk_duration must be positive") self.file_path = file_path self.chunk_duration = chunk_duration self.file_size = os.path.getsize(file_path) self.source_duration = self._probe_duration() def _probe_duration(self) -> float: """Get source file duration via FFmpeg probe.""" try: result = probe_file(self.file_path) if result.duration is None or result.duration <= 0: raise ChunkReadError( f"Cannot determine duration for {self.file_path}" ) return result.duration except ChunkReadError: raise except Exception as e: raise ChunkReadError( f"Failed to probe {self.file_path}: {e}" ) from e @property def expected_chunks(self) -> int: """Calculate expected number of chunks (last chunk may be shorter).""" if self.source_duration <= 0: return 0 return math.ceil(self.source_duration / self.chunk_duration) def chunks(self) -> Generator[Chunk, None, None]: """ Yield Chunk objects representing time segments of the source file. Generator-based: chunks are yielded on demand. Each chunk defines a time range — actual extraction is done by the processor. """ total = self.expected_chunks for sequence in range(total): start_time = sequence * self.chunk_duration end_time = min( start_time + self.chunk_duration, self.source_duration ) duration = end_time - start_time yield Chunk( sequence=sequence, start_time=start_time, end_time=end_time, source_path=self.file_path, duration=duration, )