From ae895643733aa7c9fe9135ddef8155da592d2f5a Mon Sep 17 00:00:00 2001 From: Mariano Gabriel Date: Sun, 19 Oct 2025 22:49:36 -0300 Subject: [PATCH] add whisper to main command, ignore output files --- .gitignore | 11 ++++- README.md | 111 +++++++++++++++++++++++++++++---------------- output/.gitkeep | 0 process_meeting.py | 111 +++++++++++++++++++++++++++++++++++++++++---- samples/.gitkeep | 0 5 files changed, 183 insertions(+), 50 deletions(-) create mode 100644 output/.gitkeep create mode 100644 samples/.gitkeep diff --git a/.gitignore b/.gitignore index ad7a3dd..25ac2f6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,11 @@ -!samples/.gitkeep +# Sample videos samples/* +!samples/.gitkeep + +# Output files +output/* +!output/.gitkeep + +# Extracted frames +frames/ +__pycache__ \ No newline at end of file diff --git a/README.md b/README.md index 0732c59..4a1d7ab 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,13 @@ brew install ffmpeg pip install -r requirements.txt ``` -### 3. Optional: Install Alternative OCR Engines +### 3. Whisper (for audio transcription) + +```bash +pip install openai-whisper +``` + +### 4. Optional: Install Alternative OCR Engines ```bash # EasyOCR (better for rotated/handwritten text) @@ -53,52 +59,67 @@ pip install paddleocr ## Quick Start -### Basic Usage (Screen Content Only) +### Recommended: Run Everything in One Command + +```bash +python process_meeting.py samples/meeting.mkv --run-whisper +``` + +This will: +1. Run Whisper transcription (audio → text) +2. Extract frames every 5 seconds +3. Run OCR to extract screen text +4. Merge audio + screen content +5. Save everything to `output/` folder + +### Alternative: Use Existing Whisper Transcript + +If you already have a Whisper transcript: +```bash +python process_meeting.py samples/meeting.mkv --transcript output/meeting.json +``` + +### Screen Content Only (No Audio) ```bash python process_meeting.py samples/meeting.mkv ``` -This will: -1. Extract frames every 5 seconds -2. Run OCR to extract screen text -3. Save enhanced transcript to `meeting_enhanced.txt` - -### With Whisper Transcript - -First, generate a Whisper transcript: -```bash -whisper samples/meeting.mkv --model base --output_format json -``` - -Then process with screen content: -```bash -python process_meeting.py samples/meeting.mkv --transcript samples/meeting.json -``` - ## Usage Examples +### Run with different Whisper models +```bash +# Tiny model (fastest, less accurate) +python process_meeting.py samples/meeting.mkv --run-whisper --whisper-model tiny + +# Small model (balanced) +python process_meeting.py samples/meeting.mkv --run-whisper --whisper-model small + +# Large model (slowest, most accurate) +python process_meeting.py samples/meeting.mkv --run-whisper --whisper-model large +``` + ### Extract frames at different intervals ```bash -# Every 10 seconds -python process_meeting.py samples/meeting.mkv --interval 10 +# Every 10 seconds (with Whisper) +python process_meeting.py samples/meeting.mkv --run-whisper --interval 10 # Every 3 seconds (more detailed) -python process_meeting.py samples/meeting.mkv --interval 3 +python process_meeting.py samples/meeting.mkv --run-whisper --interval 3 ``` ### Use scene detection (smarter, fewer frames) ```bash -python process_meeting.py samples/meeting.mkv --scene-detection +python process_meeting.py samples/meeting.mkv --run-whisper --scene-detection ``` ### Use different OCR engines ```bash # EasyOCR (good for varied layouts) -python process_meeting.py samples/meeting.mkv --ocr-engine easyocr +python process_meeting.py samples/meeting.mkv --run-whisper --ocr-engine easyocr # PaddleOCR (good for code/terminal) -python process_meeting.py samples/meeting.mkv --ocr-engine paddleocr +python process_meeting.py samples/meeting.mkv --run-whisper --ocr-engine paddleocr ``` ### Extract frames only (no merging) @@ -108,41 +129,48 @@ python process_meeting.py samples/meeting.mkv --extract-only ### Custom output location ```bash -python process_meeting.py samples/meeting.mkv --output my_meeting.txt --frames-dir my_frames/ +python process_meeting.py samples/meeting.mkv --run-whisper --output-dir my_outputs/ ``` ### Enable verbose logging ```bash # Show detailed debug information -python process_meeting.py samples/meeting.mkv --verbose - -# Short form -python process_meeting.py samples/meeting.mkv -v +python process_meeting.py samples/meeting.mkv --run-whisper --verbose ``` ## Output Files -After processing, you'll get: +All output files are saved to the `output/` directory by default: -- **`