From eb8b1f4f11d11e63d70408ab23dfd6700cc34c74 Mon Sep 17 00:00:00 2001 From: Mariano Gabriel Date: Thu, 4 Dec 2025 20:24:52 -0300 Subject: [PATCH] updated readme --- README.md | 76 ++++++++++++++++++------------------------------------- 1 file changed, 24 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index c1570c9..6a57483 100644 --- a/README.md +++ b/README.md @@ -46,25 +46,19 @@ For speaker diarization, you'll need a HuggingFace token with access to pyannote ## Quick Start -### Recommended: Embed Frames with Scene Detection +### Recommended Usage ```bash -python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection +python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --scene-threshold 10 --diarize ``` This will: -1. Run Whisper transcription (audio → text) -2. Extract frames at scene changes (smarter than fixed intervals) -3. Embed frame references in the transcript for LLM analysis +1. Run WhisperX transcription with speaker diarization +2. Extract frames at scene changes (threshold 10 = moderately sensitive) +3. Create an enhanced transcript with frame file references 4. Save everything to `output/` folder -### With Speaker Diarization (WhisperX) - -```bash -python process_meeting.py samples/meeting.mkv --run-whisper --diarize --embed-images --scene-detection -``` - -This uses WhisperX to identify different speakers in the transcript. +The `--embed-images` flag adds frame paths to the transcript (e.g., `Frame: frames/video_00257.jpg`), keeping the transcript small while the frames stay in the `frames/` folder for LLM access. 
### Re-run with Cached Results @@ -76,48 +70,38 @@ python process_meeting.py samples/meeting.mkv --embed-images # Skip only specific cached items python process_meeting.py samples/meeting.mkv --embed-images --skip-cache-frames python process_meeting.py samples/meeting.mkv --embed-images --skip-cache-whisper -python process_meeting.py samples/meeting.mkv --embed-images --skip-cache-analysis # Force complete reprocessing -python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --no-cache +python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --diarize --no-cache ``` ## Usage Examples ### Scene Detection Options ```bash -# Default scene detection (threshold: 15) -python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection +# Default threshold (15) +python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --diarize -# More sensitive (more frames captured, threshold: 5) -python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection --scene-threshold 5 +# More sensitive (more frames, threshold: 5) +python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --scene-threshold 5 --diarize # Less sensitive (fewer frames, threshold: 30) -python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection --scene-threshold 30 +python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --scene-threshold 30 --diarize ``` ### Fixed Interval Extraction (alternative to scene detection) ```bash # Every 10 seconds -python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --interval 10 +python process_meeting.py samples/meeting.mkv --embed-images --interval 10 --diarize # Every 3 seconds (more detailed) -python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --interval 3 -``` - -### Frame Quality Options -```bash -# Default quality (80) -python 
process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection - -# Lower quality for smaller files (60) -python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection --embed-quality 60 +python process_meeting.py samples/meeting.mkv --embed-images --interval 3 --diarize ``` ### Caching Examples ```bash # First run - processes everything -python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection +python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --scene-threshold 10 --diarize # Iterate on scene threshold (reuse whisper transcript) python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --scene-threshold 5 --skip-cache-frames --skip-cache-analysis @@ -126,17 +110,17 @@ python process_meeting.py samples/meeting.mkv --embed-images --scene-detection - python process_meeting.py samples/meeting.mkv --embed-images --skip-cache-whisper # Force complete reprocessing -python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --no-cache +python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --diarize --no-cache ``` ### Custom output location ```bash -python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --output-dir my_outputs/ +python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --diarize --output-dir my_outputs/ ``` ### Enable verbose logging ```bash -python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection --verbose +python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --diarize --verbose ``` ## Output Files @@ -175,24 +159,17 @@ This allows you to iterate on scene detection thresholds without re-running Whis ### Complete Workflow (One Command!) 
```bash -# Process everything in one step with scene detection -python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection - -# With speaker diarization -python process_meeting.py samples/meeting.mkv --run-whisper --diarize --embed-images --scene-detection +python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --scene-threshold 10 --diarize ``` ### Typical Iterative Workflow ```bash # First run - full processing -python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection +python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --scene-threshold 10 --diarize # Adjust scene threshold (keeps cached whisper transcript) -python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --scene-threshold 10 --skip-cache-frames --skip-cache-analysis - -# Try different frame quality -python process_meeting.py samples/meeting.mkv --embed-images --embed-quality 60 --skip-cache-frames --skip-cache-analysis +python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --scene-threshold 5 --skip-cache-frames --skip-cache-analysis ``` ### Example Prompt for Claude @@ -223,11 +200,8 @@ usage: process_meeting.py [-h] [--transcript TRANSCRIPT] [--run-whisper] Main Options: video Path to video file - --run-whisper Run Whisper transcription before processing - --whisper-model Whisper model: tiny, base, small, medium, large (default: medium) --diarize Use WhisperX with speaker diarization - --embed-images Embed frame references for LLM analysis (recommended) - --embed-quality JPEG quality for frames (default: 80) + --embed-images Add frame file references to transcript (recommended) Frame Extraction: --scene-detection Use FFmpeg scene detection (recommended) @@ -241,6 +215,8 @@ Caching: --skip-cache-analysis Re-run analysis only Other: + --run-whisper Run Whisper (without diarization) + --whisper-model Whisper model: tiny, base, small, 
medium, large (default: medium) --transcript, -t Path to existing Whisper transcript (JSON or TXT) --output, -o Output file for enhanced transcript --output-dir Directory for output files (default: output/) @@ -262,10 +238,6 @@ Other: - **Whisper** (`--run-whisper`): Standard transcription, fast - **WhisperX** (`--run-whisper --diarize`): Adds speaker identification, requires HuggingFace token -### Frame Quality -- Default quality (80) works well for most cases -- Use `--embed-quality 60` for smaller files if storage is a concern - ### Deduplication - Enabled by default - removes similar consecutive frames - Disable with `--no-deduplicate` if slides/screens change subtly