diff --git a/media/client/src/backends/subprocess.rs b/media/client/src/backends/subprocess.rs index 77a861c..418361c 100644 --- a/media/client/src/backends/subprocess.rs +++ b/media/client/src/backends/subprocess.rs @@ -23,6 +23,48 @@ use tracing::{error, info, warn}; use crate::encoder::{EncodedPacket, MediaType}; +/// Check if H.264 data contains an IDR (keyframe) NAL unit. +/// Scans for NAL start codes (00 00 01 or 00 00 00 01) and checks +/// the NAL type (lower 5 bits). Type 5 = IDR slice. +pub fn h264_is_keyframe(data: &[u8]) -> bool { + let mut i = 0; + while i + 3 < data.len() { + if data[i] == 0 && data[i + 1] == 0 { + let (nal_byte, skip) = if data[i + 2] == 1 { + (data.get(i + 3), 4) + } else if data[i + 2] == 0 && i + 4 < data.len() && data[i + 3] == 1 { + (data.get(i + 4), 5) + } else { + (None, 1) + }; + if let Some(&b) = nal_byte { + let nal_type = b & 0x1F; + if nal_type == 5 { + return true; + } + } + i += skip; + } else { + i += 1; + } + } + false +} + +/// Strip ADTS header from AAC data if present. Returns raw AAC frame. +/// ADTS header is 7 bytes (no CRC) or 9 bytes (with CRC). +fn strip_adts(data: &[u8]) -> Vec<u8> { + if data.len() >= 7 && data[0] == 0xFF && (data[1] & 0xF0) == 0xF0 { + let has_crc = (data[1] & 0x01) == 0; // protection_absent=0 means CRC present + let header_len = if has_crc { 9 } else { 7 }; + if data.len() > header_len { + return data[header_len..].to_vec(); + } + } + data.to_vec() +} + + pub struct SubprocessConfig { pub device: String, pub fps: u32, @@ -324,9 +366,14 @@ fn demux_and_send( } } else if let Some((audio_idx, audio_tb_num, audio_tb_den)) = audio_info { if stream_idx == audio_idx { + // Strip ADTS header if present — normalize to raw AAC on the wire. + // mpegts backends (e.g. gpu-screen-recorder) wrap AAC in ADTS; + // NUT (ffmpeg) sends raw AAC. Stripping here makes the wire + // format consistent regardless of capture backend. 
+ let audio_data = strip_adts(&data); let encoded = EncodedPacket { media_type: MediaType::Audio, - data, + data: audio_data, pts: packet.pts().unwrap_or(0), dts: packet.dts().unwrap_or(0), keyframe: packet.is_key(), diff --git a/media/client/src/main.rs b/media/client/src/main.rs index 8033dd8..5b57a85 100644 --- a/media/client/src/main.rs +++ b/media/client/src/main.rs @@ -58,6 +58,7 @@ async fn main() -> Result<()> { sample_rate: 48000, channels: 2, codec: "aac".into(), + framing: "raw".into(), }, }; protocol::write_packet(&mut writer, &session_start.to_wire_packet()?).await?; diff --git a/media/common/src/protocol.rs b/media/common/src/protocol.rs index 17c9b10..d0876f6 100644 --- a/media/common/src/protocol.rs +++ b/media/common/src/protocol.rs @@ -166,6 +166,14 @@ pub struct AudioParams { pub sample_rate: u32, pub channels: u16, pub codec: String, + /// Audio framing on the wire: "raw" (no container headers) or "adts". + /// Default "raw" — client strips ADTS before sending. + #[serde(default = "default_framing")] + pub framing: String, +} + +fn default_framing() -> String { + "raw".into() } impl ControlMessage { @@ -231,6 +239,7 @@ mod tests { sample_rate: 48000, channels: 2, codec: "aac".into(), + framing: "raw".into(), }, }; let wire = msg.to_wire_packet().unwrap(); diff --git a/media/server/src/main.rs b/media/server/src/main.rs index 56fce18..996f2b9 100644 --- a/media/server/src/main.rs +++ b/media/server/src/main.rs @@ -98,9 +98,9 @@ async fn handle_client( info!("control: {ctrl:?}"); match ctrl { - ControlMessage::SessionStart { id, video, .. 
} => { + ControlMessage::SessionStart { id, video, audio } => { let s = tokio::task::block_in_place(|| { - Session::start(&id, &sessions_dir, video.fps) + Session::start(&id, &sessions_dir, video.fps, &audio) })?; session = Some(s); } diff --git a/media/server/src/session.rs b/media/server/src/session.rs index 39b4119..61e295e 100644 --- a/media/server/src/session.rs +++ b/media/server/src/session.rs @@ -19,6 +19,7 @@ use std::process::{Child, ChildStdin, Command, Stdio}; use std::thread; use anyhow::{Context, Result}; +use cht_common::protocol::AudioParams; use tokio::io::AsyncWriteExt; use tracing::{debug, info, warn}; @@ -33,6 +34,28 @@ struct ScenePacket { keyframe: bool, } +/// ADTS configuration derived from AudioParams at session start. +struct AdtsConfig { + /// Whether to wrap audio with ADTS headers (false if client sends ADTS). + wrap: bool, + sr_idx: u8, + ch_cfg: u8, +} + +impl AdtsConfig { + fn from_params(params: &AudioParams) -> Self { + let wrap = params.framing == "raw"; + let sr_idx = match params.sample_rate { + 96000 => 0, 88200 => 1, 64000 => 2, 48000 => 3, + 44100 => 4, 32000 => 5, 24000 => 6, 22050 => 7, + 16000 => 8, 12000 => 9, 11025 => 10, 8000 => 11, + _ => 3, // default 48kHz + }; + let ch_cfg = params.channels.min(7) as u8; + Self { wrap, sr_idx, ch_cfg } + } +} + pub struct Session { #[allow(dead_code)] session_dir: PathBuf, @@ -40,13 +63,14 @@ pub struct Session { ffmpeg: Child, video_stdin: Option<ChildStdin>, audio_file: Option, + audio_config: AdtsConfig, scene_tx: Option>, #[allow(dead_code)] fps: u32, } impl Session { - pub fn start(session_id: &str, sessions_dir: &Path, fps: u32) -> Result<Self> { + pub fn start(session_id: &str, sessions_dir: &Path, fps: u32, audio_params: &AudioParams) -> Result<Self> { let active_session_file = sessions_dir .parent() .unwrap_or(sessions_dir) @@ -134,6 +158,7 @@ impl Session { ffmpeg: child, video_stdin: Some(video_stdin), audio_file, + audio_config: AdtsConfig::from_params(audio_params), scene_tx: Some(scene_tx), 
fps, }) @@ -152,9 +177,13 @@ impl Session { pub fn write_audio(&mut self, data: &[u8]) -> Result<()> { if let Some(f) = &mut self.audio_file { - // Wrap raw AAC frame with ADTS header so the file is playable/parseable. - // Assumes AAC-LC, 48kHz, stereo (matches client's encoder config). - write_adts_frame(f, data)?; + if self.audio_config.wrap { + // Client sends raw AAC — wrap with ADTS using declared params. + write_adts_frame(f, data, &self.audio_config)?; + } else { + // Client sends ADTS-framed audio — write as-is. + f.write_all(data).context("write ADTS audio")?; + } } Ok(()) } @@ -295,27 +324,23 @@ async fn scene_relay_task( /// Write a raw AAC frame wrapped in a 7-byte ADTS header. /// -/// Fixed params: AAC-LC profile, 48 kHz sample rate, 2 channels (stereo). -/// These match the client's `-c:a aac -b:a 128k` default config. -fn write_adts_frame(w: &mut impl Write, aac_data: &[u8]) -> Result<()> { - // ADTS fixed header fields: - // profile: AAC-LC = 1 (stored as profile-1 = 0 in MPEG-4 ID mode) - // sample_rate: 48000 → index 3 - // channels: 2 → channel_configuration 2 - const PROFILE_MINUS1: u8 = 1; // AAC-LC - const SR_IDX: u8 = 3; // 48 kHz - const CH_CFG: u8 = 2; // stereo +/// Uses sample rate and channel count from the session's AudioParams +/// rather than hardcoded values, so any backend can declare its format. 
+fn write_adts_frame(w: &mut impl Write, aac_data: &[u8], cfg: &AdtsConfig) -> Result<()> { + const PROFILE_MINUS1: u8 = 1; // AAC-LC (object_type 2, stored as 2-1=1) - let frame_len = (aac_data.len() + 7) as u16; // total ADTS frame = header + payload + let sr_idx = cfg.sr_idx; + let ch_cfg = cfg.ch_cfg; + let frame_len = (aac_data.len() + 7) as u16; let header: [u8; 7] = [ // byte 0-1: syncword(12) | ID(1)=0(MPEG4) | layer(2)=0 | protection(1)=1(no CRC) 0xFF, 0xF1, // byte 2: profile(2) | sr_idx(4) | private(1)=0 | ch_cfg[2](1) - (PROFILE_MINUS1 << 6) | (SR_IDX << 2) | ((CH_CFG >> 2) & 1), + (PROFILE_MINUS1 << 6) | (sr_idx << 2) | ((ch_cfg >> 2) & 1), // byte 3: ch_cfg[1:0](2) | orig(1)=0 | home(1)=0 | copyright_id(1)=0 | copyright_start(1)=0 | frame_len[12:11](2) - ((CH_CFG & 3) << 6) | ((frame_len >> 11) as u8 & 0x03), + ((ch_cfg & 3) << 6) | ((frame_len >> 11) as u8 & 0x03), // byte 4: frame_len[10:3](8) ((frame_len >> 3) & 0xFF) as u8, // byte 5: frame_len[2:0](3) | buffer_fullness[10:6](5)