mitus/process_meeting.py

#!/usr/bin/env python3
"""
Process meeting recordings to extract audio + screen content.
Combines Whisper transcripts with OCR from screen shares.
"""
import argparse
from pathlib import Path
import sys
import json
import logging
import subprocess
import shutil

from meetus.frame_extractor import FrameExtractor
from meetus.ocr_processor import OCRProcessor
from meetus.vision_processor import VisionProcessor
from meetus.transcript_merger import TranscriptMerger

logger = logging.getLogger(__name__)


def setup_logging(verbose: bool = False):
    """
    Configure logging for the application.

    Args:
        verbose: If True, set DEBUG level, otherwise INFO
    """
    level = logging.DEBUG if verbose else logging.INFO

    # Configure root logger
    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%H:%M:%S'
    )

    # Suppress verbose output from libraries
    logging.getLogger('PIL').setLevel(logging.WARNING)
    logging.getLogger('easyocr').setLevel(logging.WARNING)
    logging.getLogger('paddleocr').setLevel(logging.WARNING)


def run_whisper(video_path: Path, model: str = "base", output_dir: str = "output") -> Path:
    """
    Run Whisper transcription on video file.

    Args:
        video_path: Path to video file
        model: Whisper model to use (tiny, base, small, medium, large)
        output_dir: Directory to save output

    Returns:
        Path to generated JSON transcript
    """
    # Check if whisper is installed
    if not shutil.which("whisper"):
        logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
        sys.exit(1)

    logger.info(f"Running Whisper transcription (model: {model})...")
    logger.info("This may take a few minutes depending on video length...")

    # Run whisper command
    cmd = [
        "whisper",
        str(video_path),
        "--model", model,
        "--output_format", "json",
        "--output_dir", output_dir
    ]

    try:
        result = subprocess.run(
            cmd,
            check=True,
            capture_output=True,
            text=True
        )

        # Whisper outputs to <output_dir>/<video_stem>.json
        transcript_path = Path(output_dir) / f"{video_path.stem}.json"

        if transcript_path.exists():
            logger.info(f"✓ Whisper transcription completed: {transcript_path}")
            return transcript_path
        else:
            logger.error("Whisper completed but transcript file not found")
            sys.exit(1)

    except subprocess.CalledProcessError as e:
        logger.error(f"Whisper failed: {e.stderr}")
        sys.exit(1)


def main():
    parser = argparse.ArgumentParser(
        description="Extract screen content from meeting recordings and merge with transcripts",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run Whisper + vision analysis (recommended for code/dashboards)
  python process_meeting.py samples/meeting.mkv --run-whisper --use-vision

  # Use vision with specific context hint
  python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --vision-context code

  # Traditional OCR approach
  python process_meeting.py samples/meeting.mkv --run-whisper

  # Re-run analysis using cached frames and transcript
  python process_meeting.py samples/meeting.mkv --use-vision

  # Force reprocessing (ignore cache)
  python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --no-cache

  # Use scene detection for fewer frames
  python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --scene-detection
        """
    )

    parser.add_argument(
        'video',
        help='Path to video file'
    )

    parser.add_argument(
        '--transcript', '-t',
        help='Path to Whisper transcript (JSON or TXT)',
        default=None
    )

    parser.add_argument(
        '--run-whisper',
        action='store_true',
        help='Run Whisper transcription before processing'
    )

    parser.add_argument(
        '--whisper-model',
        choices=['tiny', 'base', 'small', 'medium', 'large'],
        help='Whisper model to use (default: base)',
        default='base'
    )

    parser.add_argument(
        '--output', '-o',
        help='Output file for enhanced transcript (default: output/<video>_enhanced.txt)',
        default=None
    )

    parser.add_argument(
        '--output-dir',
        help='Directory for output files (default: output/)',
        default='output'
    )

    parser.add_argument(
        '--frames-dir',
        help='Directory to save extracted frames (default: frames/)',
        default='frames'
    )

    parser.add_argument(
        '--interval',
        type=int,
        help='Extract frame every N seconds (default: 5)',
        default=5
    )

    parser.add_argument(
        '--scene-detection',
        action='store_true',
        help='Use scene detection instead of interval extraction'
    )

    parser.add_argument(
        '--ocr-engine',
        choices=['tesseract', 'easyocr', 'paddleocr'],
        help='OCR engine to use (default: tesseract)',
        default='tesseract'
    )

    parser.add_argument(
        '--use-vision',
        action='store_true',
        help='Use local vision model (Ollama) instead of OCR for better context understanding'
    )

    parser.add_argument(
        '--vision-model',
        help='Vision model to use with Ollama (default: llava:13b)',
        default='llava:13b'
    )

    parser.add_argument(
        '--vision-context',
        choices=['meeting', 'dashboard', 'code', 'console'],
        help='Context hint for vision analysis (default: meeting)',
        default='meeting'
    )

    parser.add_argument(
        '--no-cache',
        action='store_true',
        help='Disable caching - reprocess everything even if outputs exist'
    )

    parser.add_argument(
        '--no-deduplicate',
        action='store_true',
        help='Disable text deduplication'
    )

    parser.add_argument(
        '--extract-only',
        action='store_true',
        help='Only extract frames and OCR, skip transcript merging'
    )

    parser.add_argument(
        '--format',
        choices=['detailed', 'compact'],
        help='Output format style (default: detailed)',
        default='detailed'
    )

    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Enable verbose logging (DEBUG level)'
    )

    args = parser.parse_args()

    # Setup logging
    setup_logging(args.verbose)

    # Validate video path
    video_path = Path(args.video)
    if not video_path.exists():
        logger.error(f"Video file not found: {args.video}")
        sys.exit(1)

    # Create output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Set default output path
    if args.output is None:
        args.output = str(output_dir / f"{video_path.stem}_enhanced.txt")

    # Define cache paths
    whisper_cache = output_dir / f"{video_path.stem}.json"
    analysis_cache = output_dir / f"{video_path.stem}_{'vision' if args.use_vision else 'ocr'}.json"
    frames_cache_dir = Path(args.frames_dir)

    # Check for cached Whisper transcript
    if args.run_whisper:
        if not args.no_cache and whisper_cache.exists():
            logger.info(f"✓ Found cached Whisper transcript: {whisper_cache}")
            args.transcript = str(whisper_cache)
        else:
            logger.info("=" * 80)
            logger.info("STEP 0: Running Whisper Transcription")
            logger.info("=" * 80)
            transcript_path = run_whisper(video_path, args.whisper_model, str(output_dir))
            args.transcript = str(transcript_path)
            logger.info("")

    logger.info("=" * 80)
    logger.info("MEETING PROCESSOR")
    logger.info("=" * 80)
    logger.info(f"Video: {video_path.name}")
    logger.info(f"Analysis: {'Vision Model' if args.use_vision else f'OCR ({args.ocr_engine})'}")
    if args.use_vision:
        logger.info(f"Vision Model: {args.vision_model}")
        logger.info(f"Context: {args.vision_context}")
    logger.info(f"Frame extraction: {'Scene detection' if args.scene_detection else f'Every {args.interval}s'}")
    if args.transcript:
        logger.info(f"Transcript: {args.transcript}")
    logger.info(f"Caching: {'Disabled' if args.no_cache else 'Enabled'}")
    logger.info("=" * 80)

    # Step 1: Extract frames (with caching)
    logger.info("Step 1: Extracting frames from video...")

    # Check if frames already exist
    existing_frames = list(frames_cache_dir.glob(f"{video_path.stem}_*.jpg")) if frames_cache_dir.exists() else []

    if not args.no_cache and existing_frames and len(existing_frames) > 0:
        logger.info(f"✓ Found {len(existing_frames)} cached frames in {args.frames_dir}/")
        # Build frames_info from existing files
        frames_info = []
        for frame_path in sorted(existing_frames):
            # Try to extract timestamp from filename (e.g., video_00001_12.34s.jpg)
            try:
                timestamp_str = frame_path.stem.split('_')[-1].rstrip('s')
                timestamp = float(timestamp_str)
            except:
                timestamp = 0.0
            frames_info.append((str(frame_path), timestamp))
    else:
        extractor = FrameExtractor(str(video_path), args.frames_dir)

        if args.scene_detection:
            frames_info = extractor.extract_scene_changes()
        else:
            frames_info = extractor.extract_by_interval(args.interval)

        if not frames_info:
            logger.error("No frames extracted")
            sys.exit(1)

        logger.info(f"✓ Extracted {len(frames_info)} frames")

    # Step 2: Run analysis on frames (with caching)
    if not args.no_cache and analysis_cache.exists():
        logger.info(f"✓ Found cached analysis results: {analysis_cache}")
        with open(analysis_cache, 'r', encoding='utf-8') as f:
            screen_segments = json.load(f)
        logger.info(f"✓ Loaded {len(screen_segments)} analyzed frames from cache")
    else:
        if args.use_vision:
            # Use vision model
            logger.info("Step 2: Running vision analysis on extracted frames...")
            try:
                vision = VisionProcessor(model=args.vision_model)
                screen_segments = vision.process_frames(
                    frames_info,
                    context=args.vision_context,
                    deduplicate=not args.no_deduplicate
                )
                logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model")

            except ImportError as e:
                logger.error(f"{e}")
                sys.exit(1)
        else:
            # Use OCR
            logger.info("Step 2: Running OCR on extracted frames...")
            try:
                ocr = OCRProcessor(engine=args.ocr_engine)
                screen_segments = ocr.process_frames(
                    frames_info,
                    deduplicate=not args.no_deduplicate
                )
                logger.info(f"✓ Processed {len(screen_segments)} frames with OCR")

            except ImportError as e:
                logger.error(f"{e}")
                logger.error(f"To install {args.ocr_engine}:")
                logger.error(f"  pip install {args.ocr_engine}")
                sys.exit(1)

        # Save analysis results as JSON
        with open(analysis_cache, 'w', encoding='utf-8') as f:
            json.dump(screen_segments, f, indent=2, ensure_ascii=False)
        logger.info(f"✓ Saved analysis results to: {analysis_cache}")

    if args.extract_only:
        logger.info("Done! (extract-only mode)")
        return

    # Step 3: Merge with transcript (if provided)
    merger = TranscriptMerger()

    if args.transcript:
        logger.info("Step 3: Merging with Whisper transcript...")
        transcript_path = Path(args.transcript)

        if not transcript_path.exists():
            logger.warning(f"Transcript not found: {args.transcript}")
            logger.info("Proceeding with screen content only...")
            audio_segments = []
        else:
            audio_segments = merger.load_whisper_transcript(str(transcript_path))
            logger.info(f"✓ Loaded {len(audio_segments)} audio segments")
    else:
        logger.info("No transcript provided, using screen content only...")
        audio_segments = []

    # Merge and format
    merged = merger.merge_transcripts(audio_segments, screen_segments)
    formatted = merger.format_for_claude(merged, format_style=args.format)

    # Save output
    merger.save_transcript(formatted, args.output)

    logger.info("=" * 80)
    logger.info("✓ PROCESSING COMPLETE!")
    logger.info("=" * 80)
    logger.info(f"Enhanced transcript: {args.output}")
    logger.info(f"OCR data: {ocr_output}")
    logger.info(f"Frames: {args.frames_dir}/")
    logger.info("")
    logger.info("You can now use the enhanced transcript with Claude for summarization!")


if __name__ == '__main__':
    main()