add whisper to main command, ignore output files

This commit is contained in:
Mariano Gabriel
2025-10-19 22:49:36 -03:00
parent 93e0c06d38
commit ae89564373
5 changed files with 183 additions and 50 deletions

View File

@@ -8,6 +8,8 @@ from pathlib import Path
import sys
import json
import logging
import subprocess
import shutil
from meetus.frame_extractor import FrameExtractor
from meetus.ocr_processor import OCRProcessor
@@ -38,23 +40,78 @@ def setup_logging(verbose: bool = False):
logging.getLogger('paddleocr').setLevel(logging.WARNING)
def run_whisper(video_path: Path, model: str = "base", output_dir: str = "output") -> Path:
    """
    Run Whisper transcription on a video file via the `whisper` CLI.

    Args:
        video_path: Path to the video file to transcribe.
        model: Whisper model to use (tiny, base, small, medium, large).
        output_dir: Directory where Whisper writes its output files.

    Returns:
        Path to the generated JSON transcript
        (``<output_dir>/<video stem>.json``).

    Note:
        Exits the process (sys.exit(1)) if the whisper CLI is missing,
        the transcription fails, or no transcript file is produced —
        this helper is intended for CLI use, not library use.
    """
    # Fail fast with an install hint if the whisper CLI is not on PATH.
    if not shutil.which("whisper"):
        logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
        sys.exit(1)

    # Ensure the output directory exists so this helper also works when
    # called standalone (main() creates it too, but don't rely on that).
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    logger.info(f"Running Whisper transcription (model: {model})...")
    logger.info("This may take a few minutes depending on video length...")

    cmd = [
        "whisper",
        str(video_path),
        "--model", model,
        "--output_format", "json",
        "--output_dir", output_dir,
    ]

    # Keep the try body minimal: only the subprocess call can raise
    # CalledProcessError. Output is captured so stderr is available on failure.
    try:
        subprocess.run(
            cmd,
            check=True,
            capture_output=True,
            text=True
        )
    except subprocess.CalledProcessError as e:
        logger.error(f"Whisper failed: {e.stderr}")
        sys.exit(1)

    # Whisper writes <output_dir>/<video_stem>.json for --output_format json.
    transcript_path = Path(output_dir) / f"{video_path.stem}.json"
    if not transcript_path.exists():
        logger.error("Whisper completed but transcript file not found")
        sys.exit(1)

    logger.info(f"✓ Whisper transcription completed: {transcript_path}")
    return transcript_path
def main():
parser = argparse.ArgumentParser(
description="Extract screen content from meeting recordings and merge with transcripts",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Process video and extract frames only
python process_meeting.py samples/meeting.mkv --extract-only
# Run Whisper + full processing in one command
python process_meeting.py samples/meeting.mkv --run-whisper
# Process video with Whisper transcript
python process_meeting.py samples/meeting.mkv --transcript meeting.json
# Process video with existing Whisper transcript
python process_meeting.py samples/meeting.mkv --transcript output/meeting.json
# Use scene detection instead of interval
python process_meeting.py samples/meeting.mkv --scene-detection
python process_meeting.py samples/meeting.mkv --run-whisper --scene-detection
# Use different OCR engine
python process_meeting.py samples/meeting.mkv --ocr-engine easyocr
# Use different Whisper model and OCR engine
python process_meeting.py samples/meeting.mkv --run-whisper --whisper-model small --ocr-engine easyocr
# Extract frames only (no transcript)
python process_meeting.py samples/meeting.mkv --extract-only
"""
)
@@ -69,12 +126,31 @@ Examples:
default=None
)
parser.add_argument(
'--run-whisper',
action='store_true',
help='Run Whisper transcription before processing'
)
parser.add_argument(
'--whisper-model',
choices=['tiny', 'base', 'small', 'medium', 'large'],
help='Whisper model to use (default: base)',
default='base'
)
parser.add_argument(
'--output', '-o',
help='Output file for enhanced transcript (default: <video>_enhanced.txt)',
help='Output file for enhanced transcript (default: output/<video>_enhanced.txt)',
default=None
)
parser.add_argument(
'--output-dir',
help='Directory for output files (default: output/)',
default='output'
)
parser.add_argument(
'--frames-dir',
help='Directory to save extracted frames (default: frames/)',
@@ -137,9 +213,22 @@ Examples:
logger.error(f"Video file not found: {args.video}")
sys.exit(1)
# Create output directory
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
# Set default output path
if args.output is None:
args.output = video_path.stem + '_enhanced.txt'
args.output = str(output_dir / f"{video_path.stem}_enhanced.txt")
# Run Whisper if requested
if args.run_whisper:
logger.info("=" * 80)
logger.info("STEP 0: Running Whisper Transcription")
logger.info("=" * 80)
transcript_path = run_whisper(video_path, args.whisper_model, str(output_dir))
args.transcript = str(transcript_path)
logger.info("")
logger.info("=" * 80)
logger.info("MEETING PROCESSOR")
@@ -147,6 +236,8 @@ Examples:
logger.info(f"Video: {video_path.name}")
logger.info(f"OCR Engine: {args.ocr_engine}")
logger.info(f"Frame extraction: {'Scene detection' if args.scene_detection else f'Every {args.interval}s'}")
if args.transcript:
logger.info(f"Transcript: {args.transcript}")
logger.info("=" * 80)
# Step 1: Extract frames
@@ -181,7 +272,7 @@ Examples:
sys.exit(1)
# Save OCR results as JSON
ocr_output = video_path.stem + '_ocr.json'
ocr_output = output_dir / f"{video_path.stem}_ocr.json"
with open(ocr_output, 'w', encoding='utf-8') as f:
json.dump(screen_segments, f, indent=2, ensure_ascii=False)
logger.info(f"✓ Saved OCR results to: {ocr_output}")