From 946234eb9ee929ea9e395c2039f72dfe0d87ebf3 Mon Sep 17 00:00:00 2001 From: buenosairesam Date: Wed, 6 May 2026 11:51:43 -0300 Subject: [PATCH] update docs --- docs/README.md | 34 + {media/docs => docs/graphs}/crates.dot | 4 +- {media/docs => docs/graphs}/crates.svg | 2 +- docs/graphs/python_pipeline.dot | 86 +++ docs/graphs/python_pipeline.svg | 308 ++++++++++ .../graphs/rust_client.dot | 6 +- .../graphs/rust_client.svg | 4 +- .../graphs/rust_server.dot | 11 +- docs/graphs/rust_server.svg | 263 ++++++++ docs/graphs/system.dot | 77 +++ docs/graphs/system.svg | 262 ++++++++ docs/index.html | 581 ++++++++++++++++++ media/ctrl/docs.sh => docs/render.sh | 11 +- docs/viewer.html | 97 +++ media/docs/index.html | 193 ------ media/docs/server-pipeline.svg | 286 --------- 16 files changed, 1723 insertions(+), 502 deletions(-) create mode 100644 docs/README.md rename {media/docs => docs/graphs}/crates.dot (87%) rename {media/docs => docs/graphs}/crates.svg (99%) create mode 100644 docs/graphs/python_pipeline.dot create mode 100644 docs/graphs/python_pipeline.svg rename media/docs/client-pipeline.dot => docs/graphs/rust_client.dot (97%) rename media/docs/client-pipeline.svg => docs/graphs/rust_client.svg (99%) rename media/docs/server-pipeline.dot => docs/graphs/rust_server.dot (87%) create mode 100644 docs/graphs/rust_server.svg create mode 100644 docs/graphs/system.dot create mode 100644 docs/graphs/system.svg create mode 100644 docs/index.html rename media/ctrl/docs.sh => docs/render.sh (53%) create mode 100644 docs/viewer.html delete mode 100644 media/docs/index.html delete mode 100644 media/docs/server-pipeline.svg diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..dd5d9f6 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,34 @@ +# Mitus — Documentation + +## View + +``` +cd docs && python3 -m http.server 8000 +``` + +Then open in a browser. 
+ +## Re-render diagrams + +After editing any `graphs/*.dot` file: + +``` +./render.sh +``` + +Requires `graphviz` (`sudo apt install graphviz`). + +## Layout + +``` +docs/ +├── index.html main page (overview, diagrams, walkthroughs) +├── viewer.html pan/zoom viewer for individual SVGs +├── render.sh regenerate all SVGs from .dot sources +└── graphs/ + ├── system.{dot,svg} top-level architecture + ├── python_pipeline.{dot,svg} Python transport (default) + ├── rust_client.{dot,svg} Rust client (sender) + ├── rust_server.{dot,svg} Rust server (receiver) + └── crates.{dot,svg} Rust workspace crates +``` diff --git a/media/docs/crates.dot b/docs/graphs/crates.dot similarity index 87% rename from media/docs/crates.dot rename to docs/graphs/crates.dot index 7ba3933..bee8be4 100644 --- a/media/docs/crates.dot +++ b/docs/graphs/crates.dot @@ -1,4 +1,4 @@ -// Cargo workspace crate dependency graph +// Mitus — Rust transport workspace (media/) crate dependency graph digraph crates { graph [fontname="monospace" bgcolor="#1e1e2e" pad="0.5"] node [fontname="monospace" fontcolor="#cdd6f4" style=filled shape=box @@ -21,7 +21,7 @@ digraph crates { client [label="cht-client [sender, Wayland]\n─────────────────────────────\nbackends/subprocess.rs ffmpeg CLI + PulseAudio\n NUT demux → EncodedPacket\nbackends/mod.rs Backend enum\ncapture.rs KmsCapture (direct backend)\nencoder.rs VaapiEncoder + MediaType\npipeline.rs capture→encode thread\nmain.rs wait_for_server, transport,\n YYYYMMDD_HHMMSS session IDs" fillcolor="#1e2d3e" color="#89b4fa"] - server [label="cht-server [receiver, mcrndeb]\n─────────────────────────────\nmain.rs TCP listener\n routes Video/Audio/Control\nsession.rs ffmpeg subprocess:\n fMP4 + UDP relay\n ADTS audio writer\n Scene relay (Unix socket)\n keyframe buffering" + server [label="cht-server [receiver, mcrn]\n─────────────────────────────\nmain.rs TCP listener\n routes Video/Audio/Control\nsession.rs ffmpeg subprocess:\n fMP4 + UDP relay\n ADTS audio writer\n 
Scene relay (Unix socket)\n keyframe buffering" fillcolor="#1e2d3e" color="#89b4fa"] // Deps diff --git a/media/docs/crates.svg b/docs/graphs/crates.svg similarity index 99% rename from media/docs/crates.svg rename to docs/graphs/crates.svg index e92494a..354c975 100644 --- a/media/docs/crates.svg +++ b/docs/graphs/crates.svg @@ -170,7 +170,7 @@ server -cht-server  [receiver, mcrndeb] +cht-server  [receiver, mcrn] ───────────────────────────── main.rs       TCP listener              routes Video/Audio/Control diff --git a/docs/graphs/python_pipeline.dot b/docs/graphs/python_pipeline.dot new file mode 100644 index 0000000..d5521db --- /dev/null +++ b/docs/graphs/python_pipeline.dot @@ -0,0 +1,86 @@ +// Mitus — Python transport pipeline (default mode, --python or no flag) +// Sender bash script wraps ffmpeg CLI; receiver runs ffmpeg in-process via Python. +digraph python_pipeline { + graph [fontname="monospace" bgcolor="#1e1e2e" rankdir=TB pad="0.6" splines=polyline] + node [fontname="monospace" fontcolor="#cdd6f4" style=filled shape=box + fillcolor="#313244" color="#585b70" margin="0.25,0.12"] + edge [color="#585b70" fontname="monospace" fontcolor="#a6adc8"] + + // Hardware / OS + drm [label="/dev/dri/card0\n(KMS scanout)" shape=cylinder fillcolor="#1e3a2f" color="#a6e3a1"] + pulse [label="PulseAudio\n─────────────\nmonitor: default sink\nmic: webcam (C922)" shape=cylinder fillcolor="#1e3a2f" color="#a6e3a1"] + net [label="TCP :4444\nmpegts" shape=parallelogram fillcolor="#1e2a3e" color="#89b4fa"] + + subgraph cluster_sender { + label="Sender — sender/stream_av.sh" fontcolor="#a6adc8" color="#45475a" fontname="monospace" + + watchdog [label="watchdog loop\n─────────────\nffmpeg restart on stall\n(total_size or frame stuck > 10s)\nimmediate restart on\nDRM plane format change" + fillcolor="#2d2038" color="#cba6f7"] + + ffmpeg_send [label="ffmpeg CLI\n─────────────\nkmsgrab → hwmap=vaapi\nscale_vaapi 1920x1080 nv12\nh264_vaapi (qp=20, gop=30, no 
B-frames)\namix(monitor, mic) → aac 128k\nmpegts → TCP" + fillcolor="#1e2d3e" color="#89b4fa"] + } + + subgraph cluster_recorder { + label="StreamRecorder — cht/stream/recorder.py" fontcolor="#a6adc8" color="#45475a" fontname="monospace" + + ffmpeg_recv [label="ffmpeg listener\n─────────────\nlisten=1 on TCP :4444\n→ 2 outputs:\n fragmented MP4 (recording_*.mp4)\n UDP :4445 (mpegts → mpv)\n stdout pipe (showinfo)" + fillcolor="#1e2d3e" color="#89b4fa"] + + scene_pipe [label="scene-detect parser\n─────────────\nreads stdout pipe\nshowinfo → scene timestamps\nemits raw_frame(jpeg, ts)" + fillcolor="#2d2038" color="#cba6f7"] + } + + subgraph cluster_processor { + label="SessionProcessor — cht/stream/processor.py" fontcolor="#a6adc8" color="#45475a" fontname="monospace" + + frame_writer [label="frame writer\n─────────────\nwrites JPEG to frames/\nappends to index.json\nfires on_new_frames(ts, path)" + fillcolor="#2d2038" color="#cba6f7"] + + audio_extract [label="audio extractor\n─────────────\npolls fMP4 for new audio\nffmpeg → 16 kHz mono WAV\nchunks for transcription" + fillcolor="#2d2038" color="#cba6f7"] + + tracker [label="RecordingTracker\n─────────────\nffprobe duration\nsums segments\nfeeds timeline UI" + fillcolor="#2d2038" color="#cba6f7"] + } + + transcriber [label="TranscriberEngine\n─────────────\ncht/transcriber/engine.py\nfaster-whisper (CUDA)\ngrouped segments → transcript.json" + fillcolor="#2d2038" color="#cba6f7"] + + gui [label="Mitus GUI (GTK4)\n─────────────\nMonitor (mpv UDP)\nScrub bar · Frames · Transcript\nAgent input/output" + fillcolor="#2d2038" color="#cba6f7"] + + // Outputs + fmp4 [label="stream/\nrecording_*.mp4" shape=folder fillcolor="#2a2a3e" color="#585b70"] + udp [label="UDP :4445\n→ mpv" shape=parallelogram fillcolor="#2a2a3e" color="#585b70"] + frames [label="frames/\nindex.json + *.jpg" shape=folder fillcolor="#2a2a3e" color="#585b70"] + audio [label="audio/\nchunk_*.wav" shape=folder fillcolor="#2a2a3e" color="#585b70"] + txt 
[label="transcript.json" shape=folder fillcolor="#2a2a3e" color="#585b70"] + + // Flow — sender + drm -> ffmpeg_send [label="kmsgrab"] + pulse -> ffmpeg_send [label="-f pulse"] + watchdog -> ffmpeg_send [style=dashed label="restart"] + ffmpeg_send -> net + + // Flow — recorder + net -> ffmpeg_recv [label="mpegts"] + ffmpeg_recv -> fmp4 + ffmpeg_recv -> udp + ffmpeg_recv -> scene_pipe [label="stdout"] + udp -> gui [label="live\nmonitor"] + + // Flow — processor + scene_pipe -> frame_writer [label="raw_frame"] + frame_writer -> frames + fmp4 -> audio_extract [label="poll" style=dashed] + audio_extract -> audio + audio -> transcriber [label="WAV"] + transcriber -> txt + fmp4 -> tracker [label="ffprobe" style=dashed] + tracker -> gui [label="duration"] + + // Flow — GUI + frames -> gui + txt -> gui +} diff --git a/docs/graphs/python_pipeline.svg b/docs/graphs/python_pipeline.svg new file mode 100644 index 0000000..4f3289a --- /dev/null +++ b/docs/graphs/python_pipeline.svg @@ -0,0 +1,308 @@ + + + + + + +python_pipeline + + +cluster_sender + +Sender — sender/stream_av.sh + + +cluster_recorder + +StreamRecorder — cht/stream/recorder.py + + +cluster_processor + +SessionProcessor — cht/stream/processor.py + + + +drm + + +/dev/dri/card0 +(KMS scanout) + + + +ffmpeg_send + +ffmpeg CLI +───────────── +kmsgrab → hwmap=vaapi +scale_vaapi 1920x1080 nv12 +h264_vaapi (qp=20, gop=30, no B-frames) +amix(monitor, mic) → aac 128k +mpegts → TCP + + + +drm->ffmpeg_send + + +kmsgrab + + + +pulse + + +PulseAudio +───────────── +monitor: default sink +mic: webcam (C922) + + + +pulse->ffmpeg_send + + +-f pulse + + + +net + +TCP :4444 +mpegts + + + +ffmpeg_recv + +ffmpeg listener +───────────── +listen=1 on TCP :4444 +→ 2 outputs: +  fragmented MP4 (recording_*.mp4) +  UDP :4445 (mpegts → mpv) +  stdout pipe (showinfo) + + + +net->ffmpeg_recv + + +mpegts + + + +watchdog + +watchdog loop +───────────── +ffmpeg restart on stall +(total_size or frame stuck > 10s) +immediate restart on +DRM 
plane format change + + + +watchdog->ffmpeg_send + + +restart + + + +ffmpeg_send->net + + + + + +scene_pipe + +scene-detect parser +───────────── +reads stdout pipe +showinfo → scene timestamps +emits raw_frame(jpeg, ts) + + + +ffmpeg_recv->scene_pipe + + +stdout + + + +fmp4 + +stream/ +recording_*.mp4 + + + +ffmpeg_recv->fmp4 + + + + + +udp + +UDP :4445 +→ mpv + + + +ffmpeg_recv->udp + + + + + +frame_writer + +frame writer +───────────── +writes JPEG to frames/ +appends to index.json +fires on_new_frames(ts, path) + + + +scene_pipe->frame_writer + + +raw_frame + + + +frames + +frames/ +index.json + *.jpg + + + +frame_writer->frames + + + + + +audio_extract + +audio extractor +───────────── +polls fMP4 for new audio +ffmpeg → 16 kHz mono WAV +chunks for transcription + + + +audio + +audio/ +chunk_*.wav + + + +audio_extract->audio + + + + + +tracker + +RecordingTracker +───────────── +ffprobe duration +sums segments +feeds timeline UI + + + +gui + +Mitus GUI (GTK4) +───────────── +Monitor (mpv UDP) +Scrub bar · Frames · Transcript +Agent input/output + + + +tracker->gui + + +duration + + + +transcriber + +TranscriberEngine +───────────── +cht/transcriber/engine.py +faster-whisper (CUDA) +grouped segments → transcript.json + + + +txt + +transcript.json + + + +transcriber->txt + + + + + +fmp4->audio_extract + + +poll + + + +fmp4->tracker + + +ffprobe + + + +udp->gui + + +live +monitor + + + +frames->gui + + + + + +audio->transcriber + + +WAV + + + +txt->gui + + + + + diff --git a/media/docs/client-pipeline.dot b/docs/graphs/rust_client.dot similarity index 97% rename from media/docs/client-pipeline.dot rename to docs/graphs/rust_client.dot index b93ea6d..fb29f95 100644 --- a/media/docs/client-pipeline.dot +++ b/docs/graphs/rust_client.dot @@ -1,10 +1,10 @@ -// Client pipeline data flow +// Mitus — Rust client (sender) pipeline — media/client/ // Sender machine (Wayland, VAAPI GPU) -digraph client_pipeline { +digraph rust_client { graph [fontname="monospace" 
bgcolor="#1e1e2e" rankdir=TB pad="0.6" splines=polyline] node [fontname="monospace" fontcolor="#cdd6f4" style=filled shape=box fillcolor="#313244" color="#585b70" margin="0.25,0.12"] - edge [color="#585b70" fontname="monospace" fontcolor="#a6adc8" labelfontname="monospace"] + edge [color="#585b70" fontname="monospace" fontcolor="#a6adc8"] // Hardware drm [label="/dev/dri/card0\n(KMS scanout)" shape=cylinder fillcolor="#1e3a2f" color="#a6e3a1"] diff --git a/media/docs/client-pipeline.svg b/docs/graphs/rust_client.svg similarity index 99% rename from media/docs/client-pipeline.svg rename to docs/graphs/rust_client.svg index 067204d..8ae44f9 100644 --- a/media/docs/client-pipeline.svg +++ b/docs/graphs/rust_client.svg @@ -3,11 +3,11 @@ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> - + -client_pipeline +rust_client cluster_main diff --git a/media/docs/server-pipeline.dot b/docs/graphs/rust_server.dot similarity index 87% rename from media/docs/server-pipeline.dot rename to docs/graphs/rust_server.dot index 9bbc0a3..d8ab85b 100644 --- a/media/docs/server-pipeline.dot +++ b/docs/graphs/rust_server.dot @@ -1,6 +1,6 @@ -// Server pipeline — current implementation -// Receiver machine (mcrndeb: X11, RTX 3080, NVDEC) -digraph server_pipeline { +// Mitus — Rust server (receiver) pipeline — media/server/ +// Receiver machine (mcrn: X11, RTX 3080, NVDEC) +digraph rust_server { graph [fontname="monospace" bgcolor="#1e1e2e" rankdir=TB pad="0.6" splines=polyline] node [fontname="monospace" fontcolor="#cdd6f4" style=filled shape=box fillcolor="#313244" color="#585b70" margin="0.25,0.12"] @@ -68,9 +68,4 @@ digraph server_pipeline { // Python reads files aac_file -> audio_extract [label="reads" style=dashed] active_session -> python [label="discovers\nsession dir" style=dashed] - - // Known regression (2026-04-10) - regression [label="⚠ REGRESSED\n─────────────\nScene relay (separate pipe)\nbreaks 'one behind' flush.\ntry_send drops → decoder\ncorruption until keyframe.\n\nFix: 
move scene detection\ninto server ffmpeg as 3rd\noutput branch (10-scene-\ndetect-to-rust.md)" - shape=note fillcolor="#3d1e1e" color="#f38ba8" fontcolor="#f38ba8"] - scene_relay -> regression [style=dashed color="#f38ba8"] } diff --git a/docs/graphs/rust_server.svg b/docs/graphs/rust_server.svg new file mode 100644 index 0000000..52a7f4c --- /dev/null +++ b/docs/graphs/rust_server.svg @@ -0,0 +1,263 @@ + + + + + + +rust_server + + +cluster_rust + +cht-server (Rust) + + +cluster_python + +Python (cht app) + + + +net + +TCP :4447 +(WirePacket) + + + +listener + +Listener +───────────── +TCP accept +reads WirePacket +routes by type: +  Video → ffmpeg + scene relay +  Audio → ADTS file +  Control → session lifecycle + + + +net->listener + + +WirePacket + + + +python + +Python GUI +(cht app) + + + +ffmpeg_rec + +ffmpeg subprocess +───────────── +H.264 pipe:0 → 2 outputs: +  1. fMP4 (frag_keyframe) +  2. UDP :4445 (mpegts) + + + +listener->ffmpeg_rec + + +H.264 video + + + +scene_relay + +Scene Relay +───────────── +Unix socket (scene.sock) +buffers latest keyframe +best-effort: drops if slow +100ms write timeout + + + +listener->scene_relay + + +H.264 copy ++ keyframe flag + + + +audio_writer + +Audio Writer +───────────── +ADTS header + raw AAC +→ stream/audio.aac + + + +listener->audio_writer + + +AAC audio + + + +active_session + + + +active-session +───────────── +file at data/active-session +Python polls to discover +session dir + + + +listener->active_session + + +on SessionStart + + + +fmp4 + +stream/ +recording_000.mp4 +(fragmented MP4) + + + +ffmpeg_rec->fmp4 + + +copy + + + +udp_live + +UDP :4445 +(mpegts → mpv) + + + +ffmpeg_rec->udp_live + + +copy + + + +scene_ffmpeg + +Scene Detector +───────────── +connects to scene.sock +pipes H.264 → ffmpeg: +  CUDA decode +  select=gt(scene,thresh) +  showinfo → timestamps +  MJPEG → JPEG frames + + + +scene_relay->scene_ffmpeg + + +raw H.264 +(Unix socket) + + + +aac_file + +stream/ +audio.aac +(ADTS-wrapped) + + + 
+audio_writer->aac_file + + + + + +active_session->python + + +discovers +session dir + + + +frames + +frames/ +index.json + *.jpg + + + +scene_ffmpeg->frames + + +JPEG on +scene change + + + +audio_extract + +Audio Extractor +───────────── +reads audio.aac +ffmpeg → 16kHz mono WAV +chunks + transcript WAVs + + + +audio_dir + +audio/ +chunk_*.wav +transcript_*.wav + + + +audio_extract->audio_dir + + + + + +transcriber + +Transcriber +───────────── +faster-whisper (CUDA) +segment grouping +slider: chunk size + lines/group + + + +aac_file->audio_extract + + +reads + + + +audio_dir->transcriber + + +WAV chunks + + + diff --git a/docs/graphs/system.dot b/docs/graphs/system.dot new file mode 100644 index 0000000..9de6a1b --- /dev/null +++ b/docs/graphs/system.dot @@ -0,0 +1,77 @@ +// Mitus — top-level architecture +// Sender (Wayland, VAAPI) → network → Receiver (X11, NVDEC/NVENC) → Mitus GUI app +// Two transport modes share the same recording layout and same GUI. +digraph system { + graph [fontname="monospace" bgcolor="#1e1e2e" rankdir=LR pad="0.6" splines=polyline nodesep=0.5 ranksep=0.8] + node [fontname="monospace" fontcolor="#cdd6f4" style=filled shape=box + fillcolor="#313244" color="#585b70" margin="0.25,0.14"] + edge [color="#585b70" fontname="monospace" fontcolor="#a6adc8"] + + subgraph cluster_sender { + label="Sender machine — Wayland, VAAPI GPU" fontcolor="#a6adc8" color="#45475a" fontname="monospace" + + capture_py [label="kmsgrab + PulseAudio\n─────────────\nsender/stream_av.sh\nffmpeg CLI · h264_vaapi · AAC\nmpegts over TCP" fillcolor="#2d2038" color="#cba6f7"] + + capture_rs [label="cht-client (Rust)\n─────────────\nmedia/client/\nffmpeg subprocess (subprocess backend)\nNUT demux → mpsc → WirePacket TCP" fillcolor="#1e2d3e" color="#89b4fa"] + } + + subgraph cluster_net { + label="Network" fontcolor="#a6adc8" color="#45475a" fontname="monospace" + net_py [label="TCP :4444\nmpegts" shape=parallelogram fillcolor="#1e2a3e" color="#89b4fa"] + net_rs 
[label="TCP :4447\nWirePacket framing" shape=parallelogram fillcolor="#1e2a3e" color="#89b4fa"] + } + + subgraph cluster_receiver { + label="Receiver (mcrn) — X11, NVENC/NVDEC GPU" fontcolor="#a6adc8" color="#45475a" fontname="monospace" + + recorder_py [label="StreamRecorder (Python)\n─────────────\ncht/stream/recorder.py\nffmpeg listener · TCP receive\nfMP4 writer · UDP relay\nstdout-pipe scene detect" + fillcolor="#2d2038" color="#cba6f7"] + + recorder_rs [label="cht-server (Rust)\n─────────────\nmedia/server/\nWirePacket router\nfMP4 + UDP relay (ffmpeg)\nADTS audio writer\nUnix-socket scene relay" + fillcolor="#1e2d3e" color="#89b4fa"] + + processor [label="SessionProcessor (Python)\n─────────────\ncht/stream/processor.py\nfMP4 → audio.wav (ffmpeg)\nchunked WAVs for transcribe\n[Rust mode: scene detect via\nUnix socket → ffmpeg pipe]" + fillcolor="#2d2038" color="#cba6f7"] + + transcriber [label="Transcriber\n─────────────\ncht/transcriber/engine.py\nfaster-whisper · CUDA\nsegment grouping" + fillcolor="#2d2038" color="#cba6f7"] + + gui [label="Mitus GUI (GTK4 + libadwaita)\n─────────────\ncht/window.py · cht/ui/*\nMonitor (mpv UDP) · Scrub bar\nFrames panel · Transcript panel\nAgent input/output" + fillcolor="#2d2038" color="#cba6f7"] + + agent [label="Agent runner\n─────────────\ncht/agent/*\nClaude SDK · OpenAI/Groq\n@F frame refs · @T transcript refs" + fillcolor="#2d2038" color="#cba6f7"] + + store [label="data//\n─────────────\nstream/recording_*.mp4\nstream/audio.aac (Rust mode)\nframes/*.jpg + index.json\naudio/chunk_*.wav\ntranscript.json · thread.json" + shape=folder fillcolor="#2a2a3e" color="#585b70"] + } + + // Python transport flow + capture_py -> net_py [color="#cba6f7"] + net_py -> recorder_py [color="#cba6f7"] + recorder_py -> store [color="#cba6f7"] + recorder_py -> processor [label="raw scene\nframes" color="#cba6f7"] + + // Rust transport flow + capture_rs -> net_rs [color="#89b4fa"] + net_rs -> recorder_rs [color="#89b4fa"] + recorder_rs 
-> store [color="#89b4fa"] + recorder_rs -> processor [label="scene.sock\n(H.264)" style=dashed color="#a6e3a1"] + + // Shared downstream + store -> processor [style=dashed] + processor -> transcriber [label="WAV chunks"] + transcriber -> store [label="transcript.json"] + store -> gui [label="files + watchers"] + gui -> agent [label="@-mentions"] + agent -> store [label="thread.json" style=dashed] + + // Legend + subgraph cluster_legend { + label="Legend" fontcolor="#a6adc8" color="#585b70" fontname="monospace" + l_py [label="Python" fillcolor="#2d2038" color="#cba6f7"] + l_rs [label="Rust" fillcolor="#1e2d3e" color="#89b4fa"] + l_io [label="I/O · network" shape=parallelogram fillcolor="#1e2a3e" color="#89b4fa"] + l_fs [label="filesystem" shape=folder fillcolor="#2a2a3e" color="#585b70"] + } +} diff --git a/docs/graphs/system.svg b/docs/graphs/system.svg new file mode 100644 index 0000000..6693b7c --- /dev/null +++ b/docs/graphs/system.svg @@ -0,0 +1,262 @@ + + + + + + +system + + +cluster_sender + +Sender machine — Wayland, VAAPI GPU + + +cluster_net + +Network + + +cluster_receiver + +Receiver (mcrn) — X11, NVENC/NVDEC GPU + + +cluster_legend + +Legend + + + +capture_py + +kmsgrab + PulseAudio +───────────── +sender/stream_av.sh +ffmpeg CLI · h264_vaapi · AAC +mpegts over TCP + + + +net_py + +TCP :4444 +mpegts + + + +capture_py->net_py + + + + + +capture_rs + +cht-client (Rust) +───────────── +media/client/ +ffmpeg subprocess (subprocess backend) +NUT demux → mpsc → WirePacket TCP + + + +net_rs + +TCP :4447 +WirePacket framing + + + +capture_rs->net_rs + + + + + +recorder_py + +StreamRecorder (Python) +───────────── +cht/stream/recorder.py +ffmpeg listener · TCP receive +fMP4 writer · UDP relay +stdout-pipe scene detect + + + +net_py->recorder_py + + + + + +recorder_rs + +cht-server (Rust) +───────────── +media/server/ +WirePacket router +fMP4 + UDP relay (ffmpeg) +ADTS audio writer +Unix-socket scene relay + + + +net_rs->recorder_rs + + + + + +processor + 
+SessionProcessor (Python) +───────────── +cht/stream/processor.py +fMP4 → audio.wav (ffmpeg) +chunked WAVs for transcribe +[Rust mode: scene detect via +Unix socket → ffmpeg pipe] + + + +recorder_py->processor + + +raw scene +frames + + + +store + +data/<session_id>/ +───────────── +stream/recording_*.mp4 +stream/audio.aac (Rust mode) +frames/*.jpg + index.json +audio/chunk_*.wav +transcript.json · thread.json + + + +recorder_py->store + + + + + +recorder_rs->processor + + +scene.sock +(H.264) + + + +recorder_rs->store + + + + + +transcriber + +Transcriber +───────────── +cht/transcriber/engine.py +faster-whisper · CUDA +segment grouping + + + +processor->transcriber + + +WAV chunks + + + +transcriber->store + + +transcript.json + + + +gui + +Mitus GUI (GTK4 + libadwaita) +───────────── +cht/window.py · cht/ui/* +Monitor (mpv UDP) · Scrub bar +Frames panel · Transcript panel +Agent input/output + + + +agent + +Agent runner +───────────── +cht/agent/* +Claude SDK · OpenAI/Groq +@F frame refs · @T transcript refs + + + +gui->agent + + +@-mentions + + + +agent->store + + +thread.json + + + +store->processor + + + + + +store->gui + + +files + watchers + + + +l_py + +Python + + + +l_rs + +Rust + + + +l_io + +I/O · network + + + +l_fs + +filesystem + + + diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 0000000..14dfd1d --- /dev/null +++ b/docs/index.html @@ -0,0 +1,581 @@ + + + + + +Mitus — Architecture + + + + +
+

MITUS

+ Stream viewer + agent — architecture + +
+ +
+ + + + +
+ +
+

GOAL & WALKTHROUGH

+

Mitus records a remote desktop, transcribes its audio, extracts scene-change frames, and exposes both to an LLM agent for ad-hoc Q&A.

+
+ +

What it is

+

A two-machine setup: the sender (a Wayland desktop) captures screen + audio and ships an encoded stream to the receiver. The receiver records to disk, runs scene detection on the live feed to extract per-event JPEG frames, transcribes the audio, and presents the result in a GTK4 GUI. The GUI doubles as an LLM client: select a frame or transcript span, hit Enter, and an agent (Claude SDK or any OpenAI-compatible endpoint) answers using the selected media as context.

+ +

Why the split

+

Capture wants Wayland + a VAAPI-friendly GPU; analysis wants CUDA for both faster-whisper and ffmpeg scene detection. Different machines, different drivers — the network stream is the seam. The receiver also runs the GUI because the recordings are stored locally and the agent talks to large frames as files, not blobs over a wire.

+ +

Two transport modes

+

Both modes produce the same on-disk session layout (data/<session_id>/stream/, frames/, audio/, transcript.json) so the GUI doesn't care which path the bytes took. The choice is a CLI flag.

+
    +
  • Python (default). Sender is a bash watchdog wrapping ffmpeg CLI. Receiver is cht/stream/recorder.py: an ffmpeg listener that writes fragmented MP4 + relays UDP to mpv + emits scene frames out of a showinfo stdout pipe. Simple, all in one process, every restart costs a few seconds.
  • +
  • Rust (--rust). A standalone Rust workspace under media/: cht-client on the sender, cht-server on the receiver. Wire protocol is a typed WirePacket framing instead of raw mpegts. Scene detection still runs in Python via a Unix-socket relay from the server. Connect time drops from ~20s to ~3s; session reload from disk is 1–2s.
  • +
+
The media/ directory holds the Rust transport. While both modes coexist, that name is a misnomer — a future rename is planned. For now, "Rust transport" and "media/" mean the same thing.
+ +

What the agent sees

+

Two reference syntaxes resolve to media when sent: @F0001@F0042 for frames, @T0001@T0010 for transcript segments. Single-word verbs describe and answer are sent verbatim — no system prompt, no boilerplate. If you want detail, you type it. The agent runner injects only the referenced frame paths and transcript text alongside the user message.

+ +
+
+ +
+

USAGE

+

How to start a session — sender side, receiver side, both transports.

+
+ +

Both ctrl/client.sh and ctrl/app.sh take a transport flag — --python (default) or --rust. The ctrl/ wrappers are the entrypoints; media/ctrl/* and sender/stream_av.py are implementation details they dispatch to.

+ +

Receiver (mcrn) — GUI

+

Python transport (default):

+
./ctrl/app.sh --python
+

Rust transport:

+
./ctrl/server.sh         # cht-server on TCP :4447 (Rust mode only)
+./ctrl/app.sh --rust
+

Python mode does its own TCP listening inside the GUI process — no separate server step.

+ +

Sender

+

Python transport:

+
./ctrl/client.sh --python [RECEIVER_IP] [PORT]   # default port 4444
+

(Runs sudo python3 sender/stream_av.py under the hood — sudo is required for kmsgrab.)

+

Rust transport:

+
./ctrl/client.sh --rust [server_addr]            # default mcrndeb:4447
+ +

Sync

+

Both machines share the same source tree; ctrl/sync.sh rsyncs from the dev host to mcrndeb. The receiver's filesystem is also bind-mounted at ~/mcrn on the dev host for quick file access.

+ +

Inside the GUI

+
    +
  • Frames panel — click to select; ←/→ navigate.
  • +
  • Transcript panel — click to select; ↑/↓ navigate; Shift to extend.
  • +
  • Enter — sends answer + selected refs to the agent.
  • +
  • Describe / Answer buttons — same idea, single-word verb prepended.
  • +
  • Agent input — type freely; @F1-3 and @T5 attach refs.
  • +
  • Esc — clear selection. Del — clear agent output.
  • +
  • Ctrl+R — manual segment cut.
  • +
+ +

Agent provider

+

Resolution order in cht/agent/runner.py:

+
    +
  • GROQ_API_KEY → OpenAI-compatible client against Groq.
  • +
  • OPENAI_API_KEY → OpenAI / OpenAI-compatible.
  • +
  • (default) → Claude Code SDK using your local CC subscription.
  • +
+ +
+
+ +
+

SYSTEM ARCHITECTURE

+

End-to-end view: sender capture → network → receiver record + analyse → GUI + agent. Both transports converge on the same on-disk session layout.

+
+ System architecture +
+
+ Python + Rust + Hardware / external + Filesystem +
+
+ +
+

PYTHON PIPELINE

+

Default mode. Bash + ffmpeg CLI on the sender; StreamRecorder + SessionProcessor in cht/stream/ on the receiver. Scene detection rides the recorder's ffmpeg stdout pipe — sub-second latency, no extra process.

+
+ Python pipeline +
+
+ Python module + External binary (ffmpeg) + Hardware / OS source + Filesystem output +
+
+ +
+

RUST CLIENT — sender

+

media/client/ — replaces sender/stream_av.sh when running with --rust. Two backends: subprocess (default, wraps ffmpeg CLI) and an experimental direct VAAPI capture/encoder.

+
+ Rust client pipeline +
+
+ +
+

RUST SERVER — receiver

+

media/server/ — replaces StreamRecorder when running with --rust. TCP listener with a typed WirePacket framing; routes Video/Audio/Control packets to ffmpeg recording, ADTS audio, and a Unix-socket scene relay.

+
+ Rust server pipeline +
+
+ +
+

RUST CRATES

+

Cargo workspace under media/: three crates (cht-common, cht-client, cht-server) and their external deps. Designed to be reusable as a standalone tool — mpr is expected to depend on it too.

+
+ Rust crates +
+
+ +
+

REPOSITORY STRUCTURE

+

Top-level layout. Python app under cht/; Rust transport under media/; sender bash under sender/; ops scripts under ctrl/.

+
+
cht/
+├── cht/                    Python app (GTK4 GUI, recording, transcribe, agent)
+│   ├── app.py · window.py     entrypoint + main window
+│   ├── config.py · session.py app config, session manifest
+│   ├── stream/                recorder · processor · tracker · lifecycle · ffmpeg helpers
+│   ├── audio/                 waveform engine
+│   ├── transcriber/           faster-whisper engine
+│   ├── scrub/                 proxy manager (scrub-mode preview)
+│   ├── index/                 frame index helpers
+│   ├── agent/                 runner · base · tools · claude_sdk_connection · openai_connection
+│   └── ui/                    timeline · monitor · scrub_bar · frames_panel · transcript_panel
+│                              agent_input · agent_output · markdown · keyboard · mpv · waveform
+├── media/                  Rust transport workspace (Cargo) — to be renamed once both modes coexist
+│   ├── common/                cht-common  — WirePacket, ControlMessage, logging
+│   ├── client/                cht-client  — sender (Wayland, VAAPI)
+│   ├── server/                cht-server  — receiver (TCP listener, ffmpeg fan-out)
+│   └── ctrl/                  build.sh · client.sh · server.sh
+├── sender/                 Python-mode sender — stream_av.sh (bash watchdog around ffmpeg CLI)
+├── ctrl/                   app.sh · server.sh · client.sh · sync.sh · bench.py · e2e_test.sh
+├── tests/                  pytest suites — config · ffmpeg · manager · processor · timeline · tracker
+├── data/                   runtime — sessions, active-session pointer (gitignored)
+├── logs/                   runtime logs (gitignored)
+├── docs/                   this site — index.html · viewer.html · graphs/ · render.sh
+└── pyproject.toml · uv.lock   Python deps via uv
+
+
+ +
+

DESIGN NOTES

+

Why some non-obvious choices look the way they do.

+
+ +

Same on-disk layout from both transports

+

The GUI, transcript, scene index, and agent never branch on transport mode — they only read files. The recording layout is the contract; the network protocol underneath is replaceable. This is what made the Rust port feasible without rewriting the analysis side.

+ +

Scene detection lives in the recorder, not the processor

+

In Python mode, scene-change frames come straight off the recorder's ffmpeg stdout pipe — sub-second, single process. Polling the fragmented MP4 from a separate process would add 3–5 s of disk-IPC latency. In Rust mode the same property is approximated by relaying raw H.264 over scene.sock to a separate ffmpeg, but that relay turns out to be the source of most current scene-detection pain (see The scene detection saga below).

+ +

Why bother with the Rust port

+

Two measured wins drove the work: connect time dropped from ~20 s (CLI ffmpeg startup + mpegts negotiation) to ~3 s (typed handshake), and session reload from disk dropped to 1–2 s. The Python recorder still works fine for development; the Rust path matters when you reconnect a lot.

+ +

One-word verbs, no system prompt

+

Pressing Enter sends answer + selected refs verbatim. There is no system prompt and no instruction template wrapping the message. If a question needs detail, the user types it — the model sees exactly what you'd see, not a contract you'd have to debug.

+ +

Subprocess backend over a custom encoder

+

The Rust client wraps the same ffmpeg CLI the Python sender uses, demuxes its NUT output in-process, and ships EncodedPackets. Less code to own than a direct VAAPI encode path, and it inherits ffmpeg's robustness around odd Wayland/DRM transitions. The direct VAAPI backend exists but is experimental.

+ +

Sender as a watchdog, not a daemon

+

Python-mode stream_av.sh is a bash loop that restarts ffmpeg on stall (no progress for 10 s) and restarts immediately on the DRM-plane format change that fullscreen apps trigger. Cheaper and more reliable than building stall detection into a long-lived process.

+ +

Struggles — the scene detection saga

+

Scene detection is the part of the system that has fought back the hardest. The short version: scene detection wants to live in the same ffmpeg process that does the decoding, and every architecture change has had to relearn that.

+ +

1. The "one behind" bug and the flush trick

+

Original Python pipeline ran scene detection as a branch of the same ffmpeg that records: select='gt(scene,T)' → showinfo → MJPEG. The MJPEG encoder + muxer holds the selected frame in its internal buffer until another selected frame pushes it out — so the JPEG you receive at time T is actually the previous scene change, not the current one. Classic "one behind".

+

Workaround: a flush trick — select extra adjacent frames after each scene change so the real frame gets pushed through immediately (SCENE_FLUSH_FRAMES, see cht/config.py, used in cht/stream/ffmpeg.py :: receive_record_relay_and_detect). Worked reliably only because everything was in one ffmpeg process.

+ +

2. The Rust relay broke it

+

When transport moved to Rust, the recorder split into two processes: Rust-side ffmpeg writes fMP4 + UDP, and a separate Python-side ffmpeg consumes raw H.264 from scene.sock for scene detection. Two new failure modes appeared:

+
    +
  • The flush trick stopped flushing. The MJPEG encoder behaves differently in a standalone pipe-fed ffmpeg vs. as a branch of a multi-output process — adjacent extra frames no longer reliably push the previous selection through.
  • +
  • Decoder corruption from dropped packets. The Rust relay uses try_send with a 100 ms socket write timeout (media/server/src/session.rs). On any backpressure the relay drops H.264 packets, which corrupts the downstream decoder until the next keyframe — and missed keyframes mean missed scene detections.
  • +
+ +

3. Three dead ends

+
    +
  • fMP4-tip extraction. Trigger on showinfo, then extract the frame from the just-written fragmented MP4. Fragments only finalize at keyframe boundaries (~2 s with GOP 30), so ffprobe reports stale duration and the extracted frame comes from the previous scene.
  • +
  • Single Rust ffmpeg with mixed outputs. The clean fix would be one ffmpeg in Rust doing record (-c:v copy) + relay (-c:v copy) + scene detect (decode + filter). It doesn't work — ffmpeg won't mix -c:v copy outputs with -filter_complex on a pipe input under -hwaccel cuda.
  • +
  • Tighter retry intervals on the extractor. Dropping retry from 1 s to 0.3 s made things worse — concurrent ffmpeg processes thrashing the GPU rather than completing.
  • +
+ +

4. Where it actually landed

+

Current working approach (Rust mode): the relay-fed scene detector fires showinfo with a timestamp, then Python extracts the frame from the recording file at that timestamp, with a wall-clock offset computed from the session-dir name. Reliable frames; ~1 s latency per scene from fMP4 fragment lag plus the per-extract ffmpeg spawn (~0.5 s). It's the system limping along until the proper fix lands. See def/10-scene-detect-to-rust.md and def/ISSUES.md R1, R3 for the full record.

+
Lesson. The flush hack is a dead end in any pipe-fed context. Don't try to make it work over relay — move scene detection back into the same process that has the decoded frames. That's the only configuration that has ever been quiet.
+ +

Future work

+ +

Near term — scene detection as a 3rd output of the Rust server's ffmpeg

+

Spec: def/10-scene-detect-to-rust.md. Add a third branch to the existing ffmpeg the Rust server already runs:

+
    +
  • Output 1: -c:v copy → fMP4 (unchanged)
  • +
  • Output 2: -c:v copy → UDP relay (unchanged)
  • +
  • Output 3: CUDA decode → select='gt(scene,T)' → showinfo → MJPEG out a second pipe / second Unix socket
  • +
+

This restores the single-process invariant — scene detection sees the same decoded frames as the recording branch, the flush behavior matches, no relay packet drops. Removes detect_scenes_from_pipe() in cht/stream/ffmpeg.py, the stdin-feeder thread in cht/stream/processor.py, and scene_relay_task in media/server/src/session.rs.

+

Adjacent improvements once that lands:

+
    +
  • Long-running extractor. Keep one ffmpeg open and pipe seek commands rather than spawning per frame — eliminates the ~0.5 s startup hit.
  • +
  • PTS on the wire. Have the Rust server send recording PTS alongside scene events so Python doesn't have to guess a wall-clock offset from the session-dir name (which is also why the first scene frame currently lands 7–10 s late in Rust mode — def/ISSUES.md R1).
  • +
+ +

End goal — in-process libav filter graph

+

Spec: def/09-media-transport.md. Rust server decodes via NVDEC, runs the scene filter in-process via the libav API, and writes JPEGs directly. No ffmpeg subprocess, no pipe, no relay, no extraction — scene-to-frame latency drops to near zero. The 3rd-output step above is the bridge: same single-process discipline, easier to land, and a clean rewrite target once it works.

+

Other items deferred to that broader port:

+
    +
  • Frame buffer / fast scrub. GPU ring buffer of the last N decoded frames exposed over shared memory to the Python scrub UI — replaces the mpv proxy MJPEG hack (see def/07-scrub-perf-ceiling.md).
  • +
  • Typed control protocol. The current WirePacket framing covers session lifecycle but not parameter changes; spec 09 sketches a control-message channel for things like live scene_threshold updates and reconnect-with-PTS.
  • +
  • Audio in the live UDP relay. Rust mode currently has no audio in the live monitor (def/ISSUES.md R2) because the server's ffmpeg only takes video on its stdin. Resolved naturally once the server's ffmpeg also receives the audio track.
  • +
+ +
+
+ +
+ +
+ + + + + diff --git a/media/ctrl/docs.sh b/docs/render.sh similarity index 53% rename from media/ctrl/docs.sh rename to docs/render.sh index 39162a6..05ad125 100755 --- a/media/ctrl/docs.sh +++ b/docs/render.sh @@ -1,21 +1,18 @@ #!/bin/bash # Re-render all Graphviz diagrams to SVG. -# Run this after each phase when .dot files are updated. -# Usage: ./docs.sh +# Run after editing any .dot file under docs/graphs/. +# Usage: ./render.sh set -euo pipefail -DOCS_DIR="$(cd "$(dirname "$0")/../docs" && pwd)" +GRAPHS_DIR="$(cd "$(dirname "$0")/graphs" && pwd)" if ! command -v dot &>/dev/null; then echo "graphviz not found — install with: sudo apt install graphviz" >&2 exit 1 fi -for f in "$DOCS_DIR"/*.dot; do +for f in "$GRAPHS_DIR"/*.dot; do svg="${f%.dot}.svg" echo "==> $(basename "$f") → $(basename "$svg")" dot -Tsvg "$f" -o "$svg" done - -echo "==> done. Serving at http://localhost:9099 (ctrl-c to stop)" -cd "$DOCS_DIR" && python3 -m http.server 9099 diff --git a/docs/viewer.html b/docs/viewer.html new file mode 100644 index 0000000..2a44b5b --- /dev/null +++ b/docs/viewer.html @@ -0,0 +1,97 @@ + + + + +Graph Viewer + + + +
+ +
+ + + diff --git a/media/docs/index.html b/media/docs/index.html deleted file mode 100644 index d5a83aa..0000000 --- a/media/docs/index.html +++ /dev/null @@ -1,193 +0,0 @@ - - - - -Media Transport — Architecture - - - - - - - - -
-
-

Select a diagram

- -
-
-

← pick a diagram from the sidebar

-
-
- - - - diff --git a/media/docs/server-pipeline.svg b/media/docs/server-pipeline.svg deleted file mode 100644 index bc5be7a..0000000 --- a/media/docs/server-pipeline.svg +++ /dev/null @@ -1,286 +0,0 @@ - - - - - - -server_pipeline - - -cluster_rust - -cht-server (Rust) - - -cluster_python - -Python (cht app) - - - -net - -TCP :4447 -(WirePacket) - - - -listener - -Listener -───────────── -TCP accept -reads WirePacket -routes by type: -  Video → ffmpeg + scene relay -  Audio → ADTS file -  Control → session lifecycle - - - -net->listener - - -WirePacket - - - -python - -Python GUI -(cht app) - - - -ffmpeg_rec - -ffmpeg subprocess -───────────── -H.264 pipe:0 → 2 outputs: -  1. fMP4 (frag_keyframe) -  2. UDP :4445 (mpegts) - - - -listener->ffmpeg_rec - - -H.264 video - - - -scene_relay - -Scene Relay -───────────── -Unix socket (scene.sock) -buffers latest keyframe -best-effort: drops if slow -100ms write timeout - - - -listener->scene_relay - - -H.264 copy -+ keyframe flag - - - -audio_writer - -Audio Writer -───────────── -ADTS header + raw AAC -→ stream/audio.aac - - - -listener->audio_writer - - -AAC audio - - - -active_session - - - -active-session -───────────── -file at data/active-session -Python polls to discover -session dir - - - -listener->active_session - - -on SessionStart - - - -fmp4 - -stream/ -recording_000.mp4 -(fragmented MP4) - - - -ffmpeg_rec->fmp4 - - -copy - - - -udp_live - -UDP :4445 -(mpegts → mpv) - - - -ffmpeg_rec->udp_live - - -copy - - - -scene_ffmpeg - -Scene Detector -───────────── -connects to scene.sock -pipes H.264 → ffmpeg: -  CUDA decode -  select=gt(scene,thresh) -  showinfo → timestamps -  MJPEG → JPEG frames - - - -scene_relay->scene_ffmpeg - - -raw H.264 -(Unix socket) - - - -regression - - - -⚠ REGRESSED -───────────── -Scene relay (separate pipe) -breaks 'one behind' flush. -try_send drops → decoder -corruption until keyframe. 
-Fix: move scene detection -into server ffmpeg as 3rd -output branch (10-scene- -detect-to-rust.md) - - - -scene_relay->regression - - - - - -aac_file - -stream/ -audio.aac -(ADTS-wrapped) - - - -audio_writer->aac_file - - - - - -active_session->python - - -discovers -session dir - - - -frames - -frames/ -index.json + *.jpg - - - -scene_ffmpeg->frames - - -JPEG on -scene change - - - -audio_extract - -Audio Extractor -───────────── -reads audio.aac -ffmpeg → 16kHz mono WAV -chunks + transcript WAVs - - - -audio_dir - -audio/ -chunk_*.wav -transcript_*.wav - - - -audio_extract->audio_dir - - - - - -transcriber - -Transcriber -───────────── -faster-whisper (CUDA) -segment grouping -slider: chunk size + lines/group - - - -aac_file->audio_extract - - -reads - - - -audio_dir->transcriber - - -WAV chunks - - -