"""
Chunker — probes a media file and yields time-based Chunk objects.

Demonstrates:
- Function parameters and defaults (Interview Topic 1)
- List comprehensions and efficient iteration / generators (Interview Topic 3)
"""
|
|
|
|
import math
|
|
import os
|
|
from typing import Generator
|
|
|
|
from core.ffmpeg.probe import probe_file
|
|
|
|
from .exceptions import ChunkReadError
|
|
from .models import Chunk
|
|
|
|
|
|
class Chunker:
    """
    Splits a media file into time-based chunks via a generator.

    Uses FFmpeg probe to get duration, then yields Chunk objects
    representing time segments (no data read — extraction happens in the processor).

    Args:
        file_path: Path to the source media file
        chunk_duration: Duration of each chunk in seconds (default: 10.0)

    Raises:
        ChunkReadError: If the file does not exist or its duration cannot
            be determined.
        ValueError: If chunk_duration is not a positive, finite number.
    """

    def __init__(self, file_path: str, chunk_duration: float = 10.0):
        if not os.path.isfile(file_path):
            raise ChunkReadError(f"File not found: {file_path}")
        if chunk_duration <= 0:
            raise ValueError("chunk_duration must be positive")
        # NaN slips past the comparison above (every comparison with NaN is
        # False) and an infinite value would silently produce zero chunks,
        # so both are rejected explicitly.
        if not math.isfinite(chunk_duration):
            raise ValueError("chunk_duration must be finite")

        self.file_path = file_path
        self.chunk_duration = chunk_duration
        self.file_size = os.path.getsize(file_path)
        self.source_duration = self._probe_duration()

    def _probe_duration(self) -> float:
        """
        Get source file duration via FFmpeg probe.

        Returns:
            The probed duration; always a positive, finite number.

        Raises:
            ChunkReadError: If probing fails for any reason, or the reported
                duration is missing, non-positive, or not finite.
        """
        try:
            duration = probe_file(self.file_path).duration
            if duration is None or duration <= 0 or not math.isfinite(duration):
                raise ChunkReadError(
                    f"Cannot determine duration for {self.file_path}"
                )
            return duration
        except ChunkReadError:
            # Already the right exception type; do not wrap it a second time.
            raise
        except Exception as e:
            # Normalise every probe failure to the package's own error type.
            raise ChunkReadError(
                f"Failed to probe {self.file_path}: {e}"
            ) from e

    @property
    def expected_chunks(self) -> int:
        """
        Calculate expected number of chunks (last chunk may be shorter).

        Returns:
            0 when the source duration is not positive, otherwise the number
            of chunks that have a strictly positive length.
        """
        if self.source_duration <= 0:
            return 0

        count = math.ceil(self.source_duration / self.chunk_duration)
        # Float division can land just above an integer, e.g.
        # (0.1 + 0.2) / 0.1 == 3.0000000000000004, which would add a trailing
        # chunk whose start is already at (or past) the end of the source.
        # Drop any such zero-length chunk.
        while count > 0 and (count - 1) * self.chunk_duration >= self.source_duration:
            count -= 1
        return count

    def chunks(self) -> Generator["Chunk", None, None]:
        """
        Yield Chunk objects representing time segments of the source file.

        Generator-based: chunks are yielded on demand.
        Each chunk defines a time range — actual extraction is done by the processor.

        Yields:
            One Chunk per segment, in order, with ``sequence`` starting at 0.
            The final chunk is clamped to the source duration and may be
            shorter than ``chunk_duration``.
        """
        for sequence in range(self.expected_chunks):
            start_time = sequence * self.chunk_duration
            # Clamp the last segment so it never runs past the end of the file.
            end_time = min(start_time + self.chunk_duration, self.source_duration)

            yield Chunk(
                sequence=sequence,
                start_time=start_time,
                end_time=end_time,
                source_path=self.file_path,
                duration=end_time - start_time,
            )