From ae895643733aa7c9fe9135ddef8155da592d2f5a Mon Sep 17 00:00:00 2001
From: Mariano Gabriel <pensalo@gmail.com>
Date: Sun, 19 Oct 2025 22:49:36 -0300
Subject: [PATCH] add whisper to main command, ignore output files

---
 .gitignore         |  11 ++++-
 README.md          | 111 +++++++++++++++++++++++++++++----------------
 output/.gitkeep    |   0
 process_meeting.py | 111 +++++++++++++++++++++++++++++++++++++++++----
 samples/.gitkeep   |   0
 5 files changed, 183 insertions(+), 50 deletions(-)
 create mode 100644 output/.gitkeep
 create mode 100644 samples/.gitkeep

diff --git a/.gitignore b/.gitignore
index ad7a3dd..25ac2f6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,11 @@
-!samples/.gitkeep
+# Sample videos
 samples/*
+!samples/.gitkeep
+
+# Output files
+output/*
+!output/.gitkeep
+
+# Extracted frames and Python bytecode caches
+frames/
+__pycache__
\ No newline at end of file
diff --git a/README.md b/README.md
index 0732c59..4a1d7ab 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,13 @@ brew install ffmpeg
 pip install -r requirements.txt
 ```
 
-### 3. Optional: Install Alternative OCR Engines
+### 3. Whisper (for audio transcription)
+
+```bash
+pip install openai-whisper
+```
+
+### 4. Optional: Install Alternative OCR Engines
 
 ```bash
 # EasyOCR (better for rotated/handwritten text)
@@ -53,52 +59,67 @@ pip install paddleocr
 
 ## Quick Start
 
-### Basic Usage (Screen Content Only)
+### Recommended: Run Everything in One Command
+
+```bash
+python process_meeting.py samples/meeting.mkv --run-whisper
+```
+
+This will:
+1. Run Whisper transcription (audio → text)
+2. Extract frames every 5 seconds
+3. Run OCR to extract screen text
+4. Merge audio + screen content
+5. Save the transcripts and OCR results to the `output/` folder (extracted frames go to `frames/`)
+
+### Alternative: Use Existing Whisper Transcript
+
+If you already have a Whisper transcript:
+```bash
+python process_meeting.py samples/meeting.mkv --transcript output/meeting.json
+```
+
+### Screen Content Only (No Audio)
 
 ```bash
 python process_meeting.py samples/meeting.mkv
 ```
 
-This will:
-1. Extract frames every 5 seconds
-2. Run OCR to extract screen text
-3. Save enhanced transcript to `meeting_enhanced.txt`
-
-### With Whisper Transcript
-
-First, generate a Whisper transcript:
-```bash
-whisper samples/meeting.mkv --model base --output_format json
-```
-
-Then process with screen content:
-```bash
-python process_meeting.py samples/meeting.mkv --transcript samples/meeting.json
-```
-
 ## Usage Examples
 
+### Run with different Whisper models
+```bash
+# Tiny model (fastest, less accurate)
+python process_meeting.py samples/meeting.mkv --run-whisper --whisper-model tiny
+
+# Small model (balanced)
+python process_meeting.py samples/meeting.mkv --run-whisper --whisper-model small
+
+# Large model (slowest, most accurate)
+python process_meeting.py samples/meeting.mkv --run-whisper --whisper-model large
+```
+
 ### Extract frames at different intervals
 ```bash
-# Every 10 seconds
-python process_meeting.py samples/meeting.mkv --interval 10
+# Every 10 seconds (with Whisper)
+python process_meeting.py samples/meeting.mkv --run-whisper --interval 10
 
 # Every 3 seconds (more detailed)
-python process_meeting.py samples/meeting.mkv --interval 3
+python process_meeting.py samples/meeting.mkv --run-whisper --interval 3
 ```
 
 ### Use scene detection (smarter, fewer frames)
 ```bash
-python process_meeting.py samples/meeting.mkv --scene-detection
+python process_meeting.py samples/meeting.mkv --run-whisper --scene-detection
 ```
 
 ### Use different OCR engines
 ```bash
 # EasyOCR (good for varied layouts)
-python process_meeting.py samples/meeting.mkv --ocr-engine easyocr
+python process_meeting.py samples/meeting.mkv --run-whisper --ocr-engine easyocr
 
 # PaddleOCR (good for code/terminal)
-python process_meeting.py samples/meeting.mkv --ocr-engine paddleocr
+python process_meeting.py samples/meeting.mkv --run-whisper --ocr-engine paddleocr
 ```
 
 ### Extract frames only (no merging)
@@ -108,41 +129,48 @@ python process_meeting.py samples/meeting.mkv --extract-only
 
 ### Custom output location
 ```bash
-python process_meeting.py samples/meeting.mkv --output my_meeting.txt --frames-dir my_frames/
+python process_meeting.py samples/meeting.mkv --run-whisper --output-dir my_outputs/
 ```
 
 ### Enable verbose logging
 ```bash
 # Show detailed debug information
-python process_meeting.py samples/meeting.mkv --verbose
-
-# Short form
-python process_meeting.py samples/meeting.mkv -v
+python process_meeting.py samples/meeting.mkv --run-whisper --verbose
 ```
 
 ## Output Files
 
-After processing, you'll get:
+Transcripts and OCR data are saved to the `output/` directory by default; extracted frames are saved to `frames/`:
 
-- **`<video>_enhanced.txt`** - Enhanced transcript ready for Claude
-- **`<video>_ocr.json`** - Raw OCR data with timestamps
+- **`output/<video>_enhanced.txt`** - Enhanced transcript ready for Claude
+- **`output/<video>.json`** - Whisper transcript (if `--run-whisper` was used)
+- **`output/<video>_ocr.json`** - Raw OCR data with timestamps
 - **`frames/`** - Extracted video frames (JPG files)
 
 ## Workflow for Meeting Analysis
 
-### Complete Workflow
+### Complete Workflow (One Command!)
 
 ```bash
-# 1. Extract audio and transcribe with Whisper
-whisper samples/alo-intro1.mkv --model base --output_format json
+# Process everything in one step
+python process_meeting.py samples/alo-intro1.mkv --run-whisper --scene-detection
+
+# Output will be in output/alo-intro1_enhanced.txt
+```
+
+### Traditional Workflow (Separate Steps)
+
+```bash
+# 1. Extract audio and transcribe with Whisper (optional, if not using --run-whisper)
+whisper samples/alo-intro1.mkv --model base --output_format json --output_dir output
 
 # 2. Process video to extract screen content
 python process_meeting.py samples/alo-intro1.mkv \
-    --transcript samples/alo-intro1.json \
+    --transcript output/alo-intro1.json \
     --scene-detection
 
 # 3. Use the enhanced transcript with Claude
-# Copy the content from alo-intro1_enhanced.txt and paste into Claude
+# Copy the content from output/alo-intro1_enhanced.txt and paste into Claude
 ```
 
 ### Example Prompt for Claude
@@ -160,7 +188,9 @@ Please summarize this meeting transcript. Pay special attention to:
 ## Command Reference
 
 ```
-usage: process_meeting.py [-h] [--transcript TRANSCRIPT] [--output OUTPUT]
+usage: process_meeting.py [-h] [--transcript TRANSCRIPT] [--run-whisper]
+                          [--whisper-model {tiny,base,small,medium,large}]
+                          [--output OUTPUT] [--output-dir OUTPUT_DIR]
                           [--frames-dir FRAMES_DIR] [--interval INTERVAL]
                           [--scene-detection]
                           [--ocr-engine {tesseract,easyocr,paddleocr}]
@@ -171,7 +201,10 @@ usage: process_meeting.py [-h] [--transcript TRANSCRIPT] [--output OUTPUT]
 Options:
   video                 Path to video file
   --transcript, -t      Path to Whisper transcript (JSON or TXT)
+  --run-whisper         Run Whisper transcription before processing
+  --whisper-model       Whisper model: tiny, base, small, medium, large (default: base)
   --output, -o          Output file for enhanced transcript
+  --output-dir          Directory for output files (default: output/)
   --frames-dir          Directory to save extracted frames (default: frames/)
   --interval            Extract frame every N seconds (default: 5)
   --scene-detection     Use scene detection instead of interval extraction
diff --git a/output/.gitkeep b/output/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/process_meeting.py b/process_meeting.py
index 92b5b64..5f0b38b 100644
--- a/process_meeting.py
+++ b/process_meeting.py
@@ -8,6 +8,8 @@ from pathlib import Path
 import sys
 import json
 import logging
+import subprocess
+import shutil
 
 from meetus.frame_extractor import FrameExtractor
 from meetus.ocr_processor import OCRProcessor
@@ -38,23 +40,78 @@ def setup_logging(verbose: bool = False):
     logging.getLogger('paddleocr').setLevel(logging.WARNING)
 
 
+def run_whisper(video_path: Path, model: str = "base", output_dir: str = "output") -> Path:
+    """
+    Transcribe a video by invoking the external ``whisper`` CLI as a subprocess.
+
+    Args:
+        video_path: Path to the video file; passed to the whisper command as a string
+        model: Whisper model name passed via --model (tiny, base, small, medium, large)
+        output_dir: Directory passed via --output_dir; the JSON transcript is expected here
+
+    Returns:
+        Path to <output_dir>/<video stem>.json. Calls sys.exit(1) if whisper is missing, fails, or no file is found.
+    """
+    # Fail fast when no "whisper" executable can be found on PATH
+    if not shutil.which("whisper"):
+        logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
+        sys.exit(1)
+
+    logger.info(f"Running Whisper transcription (model: {model})...")
+    logger.info("This may take a few minutes depending on video length...")
+
+    # Build the whisper invocation as an argument list (no shell involved)
+    cmd = [
+        "whisper",
+        str(video_path),
+        "--model", model,
+        "--output_format", "json",
+        "--output_dir", output_dir
+    ]
+
+    try:
+        result = subprocess.run(  # NOTE(review): result is never read afterwards
+            cmd,
+            check=True,  # non-zero exit raises CalledProcessError (handled below)
+            capture_output=True,  # captured so stderr can be logged on failure
+            text=True  # decode captured output as str rather than bytes
+        )
+
+        # Whisper outputs to <output_dir>/<video_stem>.json; confirm the file really exists
+        transcript_path = Path(output_dir) / f"{video_path.stem}.json"
+
+        if transcript_path.exists():
+            logger.info(f"✓ Whisper transcription completed: {transcript_path}")
+            return transcript_path
+        else:
+            logger.error("Whisper completed but transcript file not found")
+            sys.exit(1)
+
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Whisper failed: {e.stderr}")
+        sys.exit(1)
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="Extract screen content from meeting recordings and merge with transcripts",
         formatter_class=argparse.RawDescriptionHelpFormatter,
         epilog="""
 Examples:
-  # Process video and extract frames only
-  python process_meeting.py samples/meeting.mkv --extract-only
+  # Run Whisper + full processing in one command
+  python process_meeting.py samples/meeting.mkv --run-whisper
 
-  # Process video with Whisper transcript
-  python process_meeting.py samples/meeting.mkv --transcript meeting.json
+  # Process video with existing Whisper transcript
+  python process_meeting.py samples/meeting.mkv --transcript output/meeting.json
 
   # Use scene detection instead of interval
-  python process_meeting.py samples/meeting.mkv --scene-detection
+  python process_meeting.py samples/meeting.mkv --run-whisper --scene-detection
 
-  # Use different OCR engine
-  python process_meeting.py samples/meeting.mkv --ocr-engine easyocr
+  # Use different Whisper model and OCR engine
+  python process_meeting.py samples/meeting.mkv --run-whisper --whisper-model small --ocr-engine easyocr
+
+  # Extract frames only (no transcript)
+  python process_meeting.py samples/meeting.mkv --extract-only
         """
     )
 
@@ -69,12 +126,31 @@ Examples:
         default=None
     )
 
+    parser.add_argument(
+        '--run-whisper',
+        action='store_true',
+        help='Run Whisper transcription before processing'
+    )
+
+    parser.add_argument(
+        '--whisper-model',
+        choices=['tiny', 'base', 'small', 'medium', 'large'],
+        help='Whisper model to use (default: base)',
+        default='base'
+    )
+
     parser.add_argument(
         '--output', '-o',
-        help='Output file for enhanced transcript (default: <video>_enhanced.txt)',
+        help='Output file for enhanced transcript (default: output/<video>_enhanced.txt)',
         default=None
     )
 
+    parser.add_argument(
+        '--output-dir',
+        help='Directory for output files (default: output/)',
+        default='output'
+    )
+
     parser.add_argument(
         '--frames-dir',
         help='Directory to save extracted frames (default: frames/)',
@@ -137,9 +213,22 @@ Examples:
         logger.error(f"Video file not found: {args.video}")
         sys.exit(1)
 
+    # Create output directory
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
     # Set default output path
     if args.output is None:
-        args.output = video_path.stem + '_enhanced.txt'
+        args.output = str(output_dir / f"{video_path.stem}_enhanced.txt")
+
+    # Run Whisper if requested
+    if args.run_whisper:
+        logger.info("=" * 80)
+        logger.info("STEP 0: Running Whisper Transcription")
+        logger.info("=" * 80)
+        transcript_path = run_whisper(video_path, args.whisper_model, str(output_dir))
+        args.transcript = str(transcript_path)
+        logger.info("")
 
     logger.info("=" * 80)
     logger.info("MEETING PROCESSOR")
@@ -147,6 +236,8 @@ Examples:
     logger.info(f"Video: {video_path.name}")
     logger.info(f"OCR Engine: {args.ocr_engine}")
     logger.info(f"Frame extraction: {'Scene detection' if args.scene_detection else f'Every {args.interval}s'}")
+    if args.transcript:
+        logger.info(f"Transcript: {args.transcript}")
     logger.info("=" * 80)
 
     # Step 1: Extract frames
@@ -181,7 +272,7 @@ Examples:
         sys.exit(1)
 
     # Save OCR results as JSON
-    ocr_output = video_path.stem + '_ocr.json'
+    ocr_output = output_dir / f"{video_path.stem}_ocr.json"
     with open(ocr_output, 'w', encoding='utf-8') as f:
         json.dump(screen_segments, f, indent=2, ensure_ascii=False)
     logger.info(f"✓ Saved OCR results to: {ocr_output}")
diff --git a/samples/.gitkeep b/samples/.gitkeep
new file mode 100644
index 0000000..e69de29