# mitus/cht/stream/ffmpeg.py
"""
Thin wrapper around ffmpeg-python for building and running ffmpeg pipelines.
All ffmpeg command construction goes through this module.
Uses ffmpeg-python's own run/run_async for subprocess management.
"""
import logging
import signal
import subprocess
import ffmpeg
log = logging.getLogger(__name__)
# Baseline global args: suppress the banner but keep the default -loglevel
# (info) — the scene-detection pipelines parse the showinfo filter's stderr
# output, which ffmpeg only emits at info level.
GLOBAL_ARGS = ("-hide_banner",)
# Quieter variant for pipelines that do not parse stderr.
# Individual pipelines can override with .global_args()
QUIET_ARGS = ("-hide_banner", "-loglevel", "warning")
def receive_and_record(stream_url, output_path):
    """Build an ffmpeg pipeline that pulls an mpegts stream and records it to MKV.

    Matroska is the container of choice because:
    - it tolerates incomplete writes gracefully (same rationale as OBS's default),
    - it carries proper timestamps, so seeking and duration detection work,
    - mpv plays a still-growing MKV better than raw mpegts.

    Returns an unstarted output node; pass it to run_async() to launch.
    """
    src = ffmpeg.input(stream_url, fflags="nobuffer", flags="low_delay")
    sink = ffmpeg.output(
        src,
        str(output_path),
        c="copy",
        f="matroska",
        flush_packets=1,
    )
    return sink.global_args(*QUIET_ARGS)
def receive_record_and_relay(stream_url, output_path, relay_url):
    """Receive a TCP stream and fan it out: fragmented-MP4 file plus UDP relay.

    A single ffmpeg process (tee via merge_outputs) feeds both sinks, so the
    file and the relay carry identical timestamps. Fragmented MP4
    (frag_keyframe+empty_moov) sidesteps MKV tail corruption: each keyframe
    boundary closes a self-contained fragment, so the file stays valid up to
    the last complete fragment (~one keyframe interval, roughly 2s).
    """
    src = ffmpeg.input(stream_url, fflags="nobuffer", flags="low_delay")
    to_file = ffmpeg.output(
        src,
        str(output_path),
        c="copy",
        f="mp4",
        movflags="frag_keyframe+empty_moov+default_base_moof",
        flush_packets=1,
        **{"bsf:a": "aac_adtstoasc"},
    )
    to_relay = ffmpeg.output(src, relay_url, c="copy", f="mpegts")
    merged = ffmpeg.merge_outputs(to_file, to_relay)
    return merged.global_args(*QUIET_ARGS)
def receive_record_relay_and_detect(stream_url, output_path, relay_url,
                                    scene_threshold=0.10, flush_frames=2):
    """One ffmpeg process: receive TCP → record fMP4 + relay UDP + scene detect.

    Three output branches hang off the same TCP input:
      1. File output — c=copy remux to fragmented MP4 (raw packets, no decode).
      2. UDP relay   — c=copy remux to mpegts for live display (raw packets).
      3. Scene frames — CUDA decode (GPU) → select(scene) + showinfo (CPU)
         → MJPEG piped to stdout.

    MJPEG over image2pipe avoids the image2 muxer's one-frame buffering
    delay; the caller splits the JPEG stream into files itself. showinfo
    lines with timestamps arrive on stderr, so the caller must drain both
    stdout and stderr continuously.
    """
    src = ffmpeg.input(
        stream_url, fflags="nobuffer", flags="low_delay",
        hwaccel="cuda",
    )

    # Branches 1+2: pure packet remux, no decode involved.
    to_file = ffmpeg.output(
        src, str(output_path),
        c="copy", f="mp4",
        movflags="frag_keyframe+empty_moov+default_base_moof",
        flush_packets=1,
        **{"bsf:a": "aac_adtstoasc"},
    )
    to_relay = ffmpeg.output(src, relay_url, c="copy", f="mpegts")

    # Branch 3: scene detection. When flush_frames > 0 the select expression
    # gains a "flush" term — pass a few extra frames after each scene change
    # to push the real frame through the encoder+muxer buffers;
    # mod(selected_n, 1+flush_frames) keeps flush frames from chaining.
    select_expr = f"gt(scene,{scene_threshold})"
    if flush_frames > 0:
        period = 1 + flush_frames
        select_expr = (
            f"{select_expr}+eq(n,prev_selected_n+1)*mod(selected_n,{period})"
        )
    frames = src.filter("select", select_expr).filter("showinfo")
    to_pipe = ffmpeg.output(
        frames, "pipe:1",
        f="image2pipe", vcodec="mjpeg",
        flush_packets=1, **{"q:v": "2", "fps_mode": "passthrough"},
    )

    merged = ffmpeg.merge_outputs(to_file, to_relay, to_pipe)
    return merged.global_args(*GLOBAL_ARGS)
def extract_scene_frames(input_path, output_dir, scene_threshold=0.10,
                         start_number=1, start_time=0.0, duration=None):
    """Extract frames from a file on scene change only (no interval fallback).

    Frames form a chronological storyboard — captured whenever content changes
    meaningfully vs the previous frame. No periodic fallback, so static
    content produces no spurious frames.

    start_time/duration are applied inside the select filter expression (NOT
    as -ss/-t input options, which break scene detection on fragmented MP4).

    Args:
        input_path: source media file.
        output_dir: pathlib.Path directory; frames written as F%04d.jpg.
        scene_threshold: scene-change score threshold (0..1).
        start_number: first frame-file sequence number.
        start_time: only select frames at t >= start_time (seconds).
        duration: only select frames at t <= start_time + duration.

    Returns:
        (stdout, stderr) decoded strings for showinfo timestamp parsing.
        ffmpeg failures are swallowed: a growing file's corrupt tail can make
        ffmpeg exit non-zero after producing valid frames, so stderr is
        returned for parsing either way.
    """
    scene_expr = f"gt(scene,{scene_threshold})"
    time_conditions = []
    if start_time > 0:
        time_conditions.append(f"gte(t,{start_time})")
    if duration is not None:
        time_conditions.append(f"lte(t,{start_time + duration})")
    if time_conditions:
        time_filter = "*".join(time_conditions)
        select_expr = f"({scene_expr})*{time_filter}"
    else:
        select_expr = scene_expr
    # CUDA hardware decode — GPU does h264 parsing, frames auto-transfer
    # to CPU for the scene filter. Falls back to software if unavailable.
    stream = ffmpeg.input(str(input_path), hwaccel="cuda")
    stream = stream.filter("select", select_expr).filter("showinfo")
    output = (
        ffmpeg.output(
            stream,
            str(output_dir / "F%04d.jpg"),
            # fps_mode replaces the deprecated -vsync option; "vfr" drops
            # frame duplication so only the selected frames are written.
            # Matches the fps_mode usage in the other pipelines here.
            fps_mode="vfr",
            **{"q:v": "2"},
            start_number=start_number,
        )
        .global_args(*GLOBAL_ARGS)
    )
    log.info("extract_scene_frames: %s", " ".join(output.compile()))
    try:
        stdout, stderr = output.run(capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        # ffmpeg may exit non-zero on growing files (corrupt tail) but still
        # produce valid frames. Return the stderr for parsing anyway.
        stderr = e.stderr or b""
        err_text = stderr.decode("utf-8", errors="replace")
        # Log the last meaningful line so we can see the real cause
        for line in reversed(err_text.splitlines()):
            if line.strip() and not line.startswith(" "):
                log.debug("ffmpeg scene error: %s", line.strip())
                break
        stdout = e.stdout or b""
    return stdout.decode("utf-8", errors="replace"), stderr.decode("utf-8", errors="replace")
def detect_scenes_from_pipe(scene_threshold=0.10, flush_frames=2, fps=30):
    """Build a scene-detection pipeline reading raw H.264 from stdin.

    Used when the Rust server provides a live H.264 stream via a Unix
    socket. Returns an unstarted output node for run_async; the caller
    bridges the socket to ffmpeg's stdin and reads:
      - stdin:  raw H.264 from the socket
      - stdout: MJPEG pipe (JPEG frames on scene change)
      - stderr: showinfo lines with pts_time timestamps
    """
    src = ffmpeg.input(
        "pipe:0", f="h264", framerate=fps, hwaccel="cuda",
        fflags="nobuffer", probesize=32, analyzeduration=0,
    )
    # Same flush trick as the TCP pipeline: after each scene change, select
    # a few extra frames to push the real frame through the encoder buffers;
    # the mod() term prevents flush frames from triggering further flushes.
    select_expr = f"gt(scene,{scene_threshold})"
    if flush_frames > 0:
        period = 1 + flush_frames
        select_expr = (
            f"{select_expr}+eq(n,prev_selected_n+1)*mod(selected_n,{period})"
        )
    frames = src.filter("select", select_expr).filter("showinfo")
    return ffmpeg.output(
        frames, "pipe:1",
        f="image2pipe", vcodec="mjpeg",
        flush_packets=1, strict="unofficial",
        **{"q:v": "2", "fps_mode": "passthrough"},
    ).global_args(*GLOBAL_ARGS)
def extract_audio_chunk(input_path, output_path, start_time=0.0, duration=None):
    """Extract audio from a recording as 16 kHz mono WAV (optimal for Whisper).

    Seeks at the input level (-ss before -i) for fast keyframe-based seeking.
    fMP4 is auto-detected; raw .aac files get an explicit format hint.

    Returns:
        (stdout, stderr) decoded strings. ffmpeg failures are swallowed —
        the last stderr line is logged at debug and output returned as-is.
    """
    in_opts = {"ss": start_time}
    if duration is not None:
        in_opts["t"] = duration
    if str(input_path).endswith(".aac"):
        # Raw AAC carries no container metadata, so the format must be named.
        in_opts["f"] = "aac"
    src = ffmpeg.input(str(input_path), **in_opts)
    pipeline = (
        ffmpeg.output(
            src, str(output_path),
            acodec="pcm_s16le", ac=1, ar=16000,
            vn=None,
        )
        .overwrite_output()
        .global_args(*QUIET_ARGS)
    )
    log.info("extract_audio_chunk: %s", " ".join(pipeline.compile()))
    try:
        stdout, stderr = pipeline.run(capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        stderr = e.stderr or b""
        log.debug("ffmpeg audio error: %s", stderr.decode("utf-8", errors="replace").strip().split("\n")[-1])
        stdout = e.stdout or b""
    return stdout.decode("utf-8", errors="replace"), stderr.decode("utf-8", errors="replace")
def extract_frame_at(input_path, output_path, timestamp):
    """Extract a single frame at the given timestamp.

    Unlike the chunk extractors, errors are not caught here — a failed run
    propagates as ffmpeg.Error to the caller.
    """
    src = ffmpeg.input(str(input_path), ss=timestamp, hwaccel="cuda")
    pipeline = (
        src.output(str(output_path), vframes=1, **{"q:v": "2"})
        .overwrite_output()
        .global_args(*QUIET_ARGS)
    )
    log.info("extract_frame_at: %s", " ".join(pipeline.compile()))
    pipeline.run(capture_stdout=True, capture_stderr=True)
def run_async(output_node, pipe_stdin=False, pipe_stdout=False, pipe_stderr=False):
    """Start an ffmpeg pipeline asynchronously via ffmpeg-python's run_async.

    Logs the compiled command line, then returns the subprocess handle.
    """
    log.info("run_async: %s", " ".join(output_node.compile()))
    proc = output_node.run_async(
        pipe_stdin=pipe_stdin,
        pipe_stdout=pipe_stdout,
        pipe_stderr=pipe_stderr,
    )
    return proc
def stop_proc(proc, timeout=5):
    """Gracefully stop an ffmpeg subprocess.

    Sends SIGINT first so ffmpeg can flush and finalize its outputs, then
    escalates to SIGKILL if the process has not exited within *timeout*
    seconds.

    Args:
        proc: subprocess.Popen handle; None or an already-exited process
            is a no-op.
        timeout: seconds to wait after SIGINT before force-killing.
    """
    if not proc or proc.poll() is not None:
        return
    proc.send_signal(signal.SIGINT)
    try:
        proc.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        proc.kill()
        # Reap the killed process; without this wait() it would linger
        # as a zombie until the interpreter exits.
        proc.wait()