normalize media pipeline at client boundary
- AudioParams.framing field: client declares "raw" or "adts"
- Client strips ADTS from audio before sending (strip_adts)
- Client does H.264 NAL inspection for keyframe detection (h264_is_keyframe)
- Server uses declared sample_rate/channels for ADTS synthesis instead of hardcoded 48kHz/stereo
- Server gates ADTS wrapping on framing field instead of per-packet sniffing

New backends only need to pipe output to demux_and_send() — server and Python unchanged.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -23,6 +23,48 @@ use tracing::{error, info, warn};
|
|||||||
|
|
||||||
use crate::encoder::{EncodedPacket, MediaType};
|
use crate::encoder::{EncodedPacket, MediaType};
|
||||||
|
|
||||||
|
/// Report whether a start-code-delimited H.264 buffer carries an IDR slice.
///
/// The buffer is walked from the front looking for a 3-byte (`00 00 01`) or
/// 4-byte (`00 00 00 01`) start code. The byte right after a start code is the
/// NAL header; its low five bits are the NAL unit type, and type 5 marks an
/// IDR (keyframe) slice.
///
/// Returns `true` as soon as an IDR NAL header is seen, `false` if the scan
/// reaches the end without finding one (including for empty input or input
/// that contains no start codes at all).
pub fn h264_is_keyframe(data: &[u8]) -> bool {
    const NAL_TYPE_MASK: u8 = 0x1F;
    const NAL_TYPE_IDR: u8 = 5;

    let mut pos = 0;
    // At least four bytes (3-byte start code + NAL header) must remain.
    while pos + 3 < data.len() {
        // Classify what begins at `pos`: the NAL header that follows a start
        // code (if any) and how far to advance past what was just examined.
        let (nal_header, advance) = match &data[pos..] {
            [0, 0, 1, nal, ..] => (Some(*nal), 4),
            [0, 0, 0, 1, nal, ..] => (Some(*nal), 5),
            _ => (None, 1),
        };

        if matches!(nal_header, Some(b) if b & NAL_TYPE_MASK == NAL_TYPE_IDR) {
            return true;
        }

        pos += advance;
    }

    false
}
|
||||||
|
|
||||||
|
/// Strip a leading ADTS header from an AAC packet and return the raw AAC frame.
///
/// A packet is treated as ADTS-framed only when it is at least 7 bytes long,
/// starts with the 12-bit syncword `0xFFF`, and has the 2-bit `layer` field
/// set to `00`. Checking the layer bits in addition to the syncword keeps
/// already-raw payloads that happen to begin with `0xFF 0xFx` from being
/// mistaken for ADTS and truncated.
///
/// The header is 7 bytes when `protection_absent` is 1 (no CRC) and 9 bytes
/// when it is 0 (CRC present).
///
/// Returns a copy of the payload after the header. The input is returned
/// unchanged (as a copy) when it does not look like ADTS, or when nothing
/// would remain after removing the header.
fn strip_adts(data: &[u8]) -> Vec<u8> {
    const HEADER_LEN: usize = 7;
    const HEADER_LEN_WITH_CRC: usize = 9;
    // Byte 1 layout: sync[3:0](4) | ID(1) | layer(2) | protection_absent(1).
    // Mask selects the sync nibble and the layer bits; ID and
    // protection_absent are free to vary.
    const SYNC_AND_LAYER_MASK: u8 = 0xF6;
    const SYNC_WITH_ZERO_LAYER: u8 = 0xF0;

    let looks_like_adts = data.len() >= HEADER_LEN
        && data[0] == 0xFF
        && (data[1] & SYNC_AND_LAYER_MASK) == SYNC_WITH_ZERO_LAYER;
    if !looks_like_adts {
        return data.to_vec();
    }

    // protection_absent == 0 means a 2-byte CRC follows the fixed header.
    let crc_present = data[1] & 0x01 == 0;
    let header_len = if crc_present {
        HEADER_LEN_WITH_CRC
    } else {
        HEADER_LEN
    };

    match data.get(header_len..) {
        Some(payload) if !payload.is_empty() => payload.to_vec(),
        // Header-only (or shorter than the CRC header): pass through as-is.
        _ => data.to_vec(),
    }
}
|
||||||
|
|
||||||
|
|
||||||
pub struct SubprocessConfig {
|
pub struct SubprocessConfig {
|
||||||
pub device: String,
|
pub device: String,
|
||||||
pub fps: u32,
|
pub fps: u32,
|
||||||
@@ -324,9 +366,14 @@ fn demux_and_send(
|
|||||||
}
|
}
|
||||||
} else if let Some((audio_idx, audio_tb_num, audio_tb_den)) = audio_info {
|
} else if let Some((audio_idx, audio_tb_num, audio_tb_den)) = audio_info {
|
||||||
if stream_idx == audio_idx {
|
if stream_idx == audio_idx {
|
||||||
|
// Strip ADTS header if present — normalize to raw AAC on the wire.
|
||||||
|
// mpegts backends (e.g. gpu-screen-recorder) wrap AAC in ADTS;
|
||||||
|
// NUT (ffmpeg) sends raw AAC. Stripping here makes the wire
|
||||||
|
// format consistent regardless of capture backend.
|
||||||
|
let audio_data = strip_adts(&data);
|
||||||
let encoded = EncodedPacket {
|
let encoded = EncodedPacket {
|
||||||
media_type: MediaType::Audio,
|
media_type: MediaType::Audio,
|
||||||
data,
|
data: audio_data,
|
||||||
pts: packet.pts().unwrap_or(0),
|
pts: packet.pts().unwrap_or(0),
|
||||||
dts: packet.dts().unwrap_or(0),
|
dts: packet.dts().unwrap_or(0),
|
||||||
keyframe: packet.is_key(),
|
keyframe: packet.is_key(),
|
||||||
|
|||||||
@@ -58,6 +58,7 @@ async fn main() -> Result<()> {
|
|||||||
sample_rate: 48000,
|
sample_rate: 48000,
|
||||||
channels: 2,
|
channels: 2,
|
||||||
codec: "aac".into(),
|
codec: "aac".into(),
|
||||||
|
framing: "raw".into(),
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
protocol::write_packet(&mut writer, &session_start.to_wire_packet()?).await?;
|
protocol::write_packet(&mut writer, &session_start.to_wire_packet()?).await?;
|
||||||
|
|||||||
@@ -166,6 +166,14 @@ pub struct AudioParams {
|
|||||||
pub sample_rate: u32,
|
pub sample_rate: u32,
|
||||||
pub channels: u16,
|
pub channels: u16,
|
||||||
pub codec: String,
|
pub codec: String,
|
||||||
|
/// Audio framing on the wire: "raw" (no container headers) or "adts".
|
||||||
|
/// Default "raw" — client strips ADTS before sending.
|
||||||
|
#[serde(default = "default_framing")]
|
||||||
|
pub framing: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn default_framing() -> String {
|
||||||
|
"raw".into()
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ControlMessage {
|
impl ControlMessage {
|
||||||
@@ -231,6 +239,7 @@ mod tests {
|
|||||||
sample_rate: 48000,
|
sample_rate: 48000,
|
||||||
channels: 2,
|
channels: 2,
|
||||||
codec: "aac".into(),
|
codec: "aac".into(),
|
||||||
|
framing: "raw".into(),
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
let wire = msg.to_wire_packet().unwrap();
|
let wire = msg.to_wire_packet().unwrap();
|
||||||
|
|||||||
@@ -98,9 +98,9 @@ async fn handle_client(
|
|||||||
info!("control: {ctrl:?}");
|
info!("control: {ctrl:?}");
|
||||||
|
|
||||||
match ctrl {
|
match ctrl {
|
||||||
ControlMessage::SessionStart { id, video, .. } => {
|
ControlMessage::SessionStart { id, video, audio } => {
|
||||||
let s = tokio::task::block_in_place(|| {
|
let s = tokio::task::block_in_place(|| {
|
||||||
Session::start(&id, &sessions_dir, video.fps)
|
Session::start(&id, &sessions_dir, video.fps, &audio)
|
||||||
})?;
|
})?;
|
||||||
session = Some(s);
|
session = Some(s);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ use std::process::{Child, ChildStdin, Command, Stdio};
|
|||||||
use std::thread;
|
use std::thread;
|
||||||
|
|
||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
|
use cht_common::protocol::AudioParams;
|
||||||
use tokio::io::AsyncWriteExt;
|
use tokio::io::AsyncWriteExt;
|
||||||
use tracing::{debug, info, warn};
|
use tracing::{debug, info, warn};
|
||||||
|
|
||||||
@@ -33,6 +34,28 @@ struct ScenePacket {
|
|||||||
keyframe: bool,
|
keyframe: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// ADTS configuration derived from AudioParams at session start.
|
||||||
|
struct AdtsConfig {
|
||||||
|
/// Whether to wrap audio with ADTS headers (false if client sends ADTS).
|
||||||
|
wrap: bool,
|
||||||
|
sr_idx: u8,
|
||||||
|
ch_cfg: u8,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AdtsConfig {
|
||||||
|
fn from_params(params: &AudioParams) -> Self {
|
||||||
|
let wrap = params.framing == "raw";
|
||||||
|
let sr_idx = match params.sample_rate {
|
||||||
|
96000 => 0, 88200 => 1, 64000 => 2, 48000 => 3,
|
||||||
|
44100 => 4, 32000 => 5, 24000 => 6, 22050 => 7,
|
||||||
|
16000 => 8, 12000 => 9, 11025 => 10, 8000 => 11,
|
||||||
|
_ => 3, // default 48kHz
|
||||||
|
};
|
||||||
|
let ch_cfg = params.channels.min(7) as u8;
|
||||||
|
Self { wrap, sr_idx, ch_cfg }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub struct Session {
|
pub struct Session {
|
||||||
#[allow(dead_code)]
|
#[allow(dead_code)]
|
||||||
session_dir: PathBuf,
|
session_dir: PathBuf,
|
||||||
@@ -40,13 +63,14 @@ pub struct Session {
|
|||||||
ffmpeg: Child,
|
ffmpeg: Child,
|
||||||
video_stdin: Option<ChildStdin>,
|
video_stdin: Option<ChildStdin>,
|
||||||
audio_file: Option<File>,
|
audio_file: Option<File>,
|
||||||
|
audio_config: AdtsConfig,
|
||||||
scene_tx: Option<tokio::sync::mpsc::Sender<ScenePacket>>,
|
scene_tx: Option<tokio::sync::mpsc::Sender<ScenePacket>>,
|
||||||
#[allow(dead_code)]
|
#[allow(dead_code)]
|
||||||
fps: u32,
|
fps: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Session {
|
impl Session {
|
||||||
pub fn start(session_id: &str, sessions_dir: &Path, fps: u32) -> Result<Self> {
|
pub fn start(session_id: &str, sessions_dir: &Path, fps: u32, audio_params: &AudioParams) -> Result<Self> {
|
||||||
let active_session_file = sessions_dir
|
let active_session_file = sessions_dir
|
||||||
.parent()
|
.parent()
|
||||||
.unwrap_or(sessions_dir)
|
.unwrap_or(sessions_dir)
|
||||||
@@ -134,6 +158,7 @@ impl Session {
|
|||||||
ffmpeg: child,
|
ffmpeg: child,
|
||||||
video_stdin: Some(video_stdin),
|
video_stdin: Some(video_stdin),
|
||||||
audio_file,
|
audio_file,
|
||||||
|
audio_config: AdtsConfig::from_params(audio_params),
|
||||||
scene_tx: Some(scene_tx),
|
scene_tx: Some(scene_tx),
|
||||||
fps,
|
fps,
|
||||||
})
|
})
|
||||||
@@ -152,9 +177,13 @@ impl Session {
|
|||||||
|
|
||||||
pub fn write_audio(&mut self, data: &[u8]) -> Result<()> {
|
pub fn write_audio(&mut self, data: &[u8]) -> Result<()> {
|
||||||
if let Some(f) = &mut self.audio_file {
|
if let Some(f) = &mut self.audio_file {
|
||||||
// Wrap raw AAC frame with ADTS header so the file is playable/parseable.
|
if self.audio_config.wrap {
|
||||||
// Assumes AAC-LC, 48kHz, stereo (matches client's encoder config).
|
// Client sends raw AAC — wrap with ADTS using declared params.
|
||||||
write_adts_frame(f, data)?;
|
write_adts_frame(f, data, &self.audio_config)?;
|
||||||
|
} else {
|
||||||
|
// Client sends ADTS-framed audio — write as-is.
|
||||||
|
f.write_all(data).context("write ADTS audio")?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -295,27 +324,23 @@ async fn scene_relay_task(
|
|||||||
|
|
||||||
/// Write a raw AAC frame wrapped in a 7-byte ADTS header.
|
/// Write a raw AAC frame wrapped in a 7-byte ADTS header.
|
||||||
///
|
///
|
||||||
/// Fixed params: AAC-LC profile, 48 kHz sample rate, 2 channels (stereo).
|
/// Uses sample rate and channel count from the session's AudioParams
|
||||||
/// These match the client's `-c:a aac -b:a 128k` default config.
|
/// rather than hardcoded values, so any backend can declare its format.
|
||||||
fn write_adts_frame(w: &mut impl Write, aac_data: &[u8]) -> Result<()> {
|
fn write_adts_frame(w: &mut impl Write, aac_data: &[u8], cfg: &AdtsConfig) -> Result<()> {
|
||||||
// ADTS fixed header fields:
|
const PROFILE_MINUS1: u8 = 1; // AAC-LC (object_type 2, stored as 2-1=1)
|
||||||
// profile: AAC-LC = 1 (stored as profile-1 = 0 in MPEG-4 ID mode)
|
|
||||||
// sample_rate: 48000 → index 3
|
|
||||||
// channels: 2 → channel_configuration 2
|
|
||||||
const PROFILE_MINUS1: u8 = 1; // AAC-LC
|
|
||||||
const SR_IDX: u8 = 3; // 48 kHz
|
|
||||||
const CH_CFG: u8 = 2; // stereo
|
|
||||||
|
|
||||||
let frame_len = (aac_data.len() + 7) as u16; // total ADTS frame = header + payload
|
let sr_idx = cfg.sr_idx;
|
||||||
|
let ch_cfg = cfg.ch_cfg;
|
||||||
|
let frame_len = (aac_data.len() + 7) as u16;
|
||||||
|
|
||||||
let header: [u8; 7] = [
|
let header: [u8; 7] = [
|
||||||
// byte 0-1: syncword(12) | ID(1)=0(MPEG4) | layer(2)=0 | protection(1)=1(no CRC)
|
// byte 0-1: syncword(12) | ID(1)=0(MPEG4) | layer(2)=0 | protection(1)=1(no CRC)
|
||||||
0xFF,
|
0xFF,
|
||||||
0xF1,
|
0xF1,
|
||||||
// byte 2: profile(2) | sr_idx(4) | private(1)=0 | ch_cfg[2](1)
|
// byte 2: profile(2) | sr_idx(4) | private(1)=0 | ch_cfg[2](1)
|
||||||
(PROFILE_MINUS1 << 6) | (SR_IDX << 2) | ((CH_CFG >> 2) & 1),
|
(PROFILE_MINUS1 << 6) | (sr_idx << 2) | ((ch_cfg >> 2) & 1),
|
||||||
// byte 3: ch_cfg[1:0](2) | orig(1)=0 | home(1)=0 | copyright_id(1)=0 | copyright_start(1)=0 | frame_len[12:11](2)
|
// byte 3: ch_cfg[1:0](2) | orig(1)=0 | home(1)=0 | copyright_id(1)=0 | copyright_start(1)=0 | frame_len[12:11](2)
|
||||||
((CH_CFG & 3) << 6) | ((frame_len >> 11) as u8 & 0x03),
|
((ch_cfg & 3) << 6) | ((frame_len >> 11) as u8 & 0x03),
|
||||||
// byte 4: frame_len[10:3](8)
|
// byte 4: frame_len[10:3](8)
|
||||||
((frame_len >> 3) & 0xFF) as u8,
|
((frame_len >> 3) & 0xFF) as u8,
|
||||||
// byte 5: frame_len[2:0](3) | buffer_fullness[10:6](5)
|
// byte 5: frame_len[2:0](3) | buffer_fullness[10:6](5)
|
||||||
|
|||||||
Reference in New Issue
Block a user