247 lines
7.5 KiB
Python
247 lines
7.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Process meeting recordings to extract audio + screen content.
|
|
Combines Whisper transcripts with vision analysis or OCR from screen shares.
|
|
"""
|
|
import argparse
|
|
import sys
|
|
import logging
|
|
|
|
from meetus.workflow import WorkflowConfig, ProcessingWorkflow
|
|
|
|
|
|
def setup_logging(verbose: bool = False):
    """Set up application-wide logging.

    Args:
        verbose: When True, log at DEBUG level; otherwise INFO.
    """
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%H:%M:%S'
    )

    # Quiet down chatty third-party libraries regardless of our own level.
    for noisy in ('PIL', 'easyocr', 'paddleocr'):
        logging.getLogger(noisy).setLevel(logging.WARNING)
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for the meeting-processing tool.

    Kept separate from main() so parsing concerns don't bury the actual
    workflow logic; returns a fully configured ArgumentParser.
    """
    parser = argparse.ArgumentParser(
        description="Extract screen content from meeting recordings and merge with transcripts",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Reference frames for LLM analysis (recommended - transcript includes frame paths)
  python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection

  # Adjust frame extraction quality (lower = smaller files)
  python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --embed-quality 60 --scene-detection

  # Hybrid approach: OpenCV + OCR (extracts text from frames)
  python process_meeting.py samples/meeting.mkv --run-whisper --use-hybrid --scene-detection

  # Hybrid + LLM cleanup (best for code formatting)
  python process_meeting.py samples/meeting.mkv --run-whisper --use-hybrid --hybrid-llm-cleanup --scene-detection

  # Iterate on scene threshold (reuse whisper transcript)
  python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --scene-threshold 5 --skip-cache-frames --skip-cache-analysis
"""
    )

    # Required arguments
    parser.add_argument(
        'video',
        help='Path to video file'
    )

    # Whisper options
    parser.add_argument(
        '--transcript', '-t',
        help='Path to Whisper transcript (JSON or TXT)',
        default=None
    )
    parser.add_argument(
        '--run-whisper',
        action='store_true',
        help='Run Whisper transcription before processing'
    )
    parser.add_argument(
        '--whisper-model',
        choices=['tiny', 'base', 'small', 'medium', 'large'],
        help='Whisper model to use (default: medium)',
        default='medium'
    )
    parser.add_argument(
        '--diarize',
        action='store_true',
        help='Use WhisperX with speaker diarization (requires whisperx and HuggingFace token)'
    )

    # Output options
    parser.add_argument(
        '--output', '-o',
        help='Output file for enhanced transcript (default: auto-generated in output directory)',
        default=None
    )
    parser.add_argument(
        '--output-dir',
        help='Base directory for outputs (default: output/)',
        default='output'
    )

    # Frame extraction options
    parser.add_argument(
        '--interval',
        type=int,
        help='Extract frame every N seconds (default: 5)',
        default=5
    )
    parser.add_argument(
        '--scene-detection',
        action='store_true',
        help='Use scene detection instead of interval extraction'
    )
    parser.add_argument(
        '--scene-threshold',
        type=float,
        help='Scene detection threshold (0-100, lower=more sensitive, default: 15)',
        default=15.0
    )

    # Analysis options
    parser.add_argument(
        '--ocr-engine',
        choices=['tesseract', 'easyocr', 'paddleocr'],
        help='OCR engine to use (default: tesseract)',
        default='tesseract'
    )
    parser.add_argument(
        '--use-vision',
        action='store_true',
        help='Use local vision model (Ollama) instead of OCR for better context understanding'
    )
    parser.add_argument(
        '--use-hybrid',
        action='store_true',
        help='Use hybrid approach: OpenCV text detection + OCR (more accurate than vision models)'
    )
    parser.add_argument(
        '--hybrid-llm-cleanup',
        action='store_true',
        help='Use LLM to clean up OCR output and preserve code formatting (requires --use-hybrid)'
    )
    parser.add_argument(
        '--hybrid-llm-model',
        help='LLM model for cleanup (default: llama3.2:3b)',
        default='llama3.2:3b'
    )
    parser.add_argument(
        '--vision-model',
        help='Vision model to use with Ollama (default: llava:13b)',
        default='llava:13b'
    )
    parser.add_argument(
        '--vision-context',
        choices=['meeting', 'dashboard', 'code', 'console'],
        help='Context hint for vision analysis (default: meeting)',
        default='meeting'
    )

    # Processing options
    parser.add_argument(
        '--no-cache',
        action='store_true',
        help='Disable caching - reprocess everything even if outputs exist'
    )
    parser.add_argument(
        '--skip-cache-frames',
        action='store_true',
        help='Skip cached frames, re-extract from video (but keep whisper/analysis cache)'
    )
    parser.add_argument(
        '--skip-cache-whisper',
        action='store_true',
        help='Skip cached whisper transcript, re-run transcription (but keep frames/analysis cache)'
    )
    parser.add_argument(
        '--skip-cache-analysis',
        action='store_true',
        help='Skip cached analysis, re-run OCR/vision (but keep frames/whisper cache)'
    )
    parser.add_argument(
        '--no-deduplicate',
        action='store_true',
        help='Disable text deduplication'
    )
    parser.add_argument(
        '--extract-only',
        action='store_true',
        help='Only extract frames and analyze, skip transcript merging'
    )
    parser.add_argument(
        '--format',
        choices=['detailed', 'compact'],
        help='Output format style (default: detailed)',
        default='detailed'
    )
    parser.add_argument(
        '--embed-images',
        action='store_true',
        help='Skip OCR/vision analysis and reference frame files directly (faster, lets LLM analyze images)'
    )
    parser.add_argument(
        '--embed-quality',
        type=int,
        help='JPEG quality for extracted frames (default: 80, lower = smaller files)',
        default=80
    )

    # Logging
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Enable verbose logging (DEBUG level)'
    )

    return parser


def main() -> int:
    """Parse CLI arguments, run the processing workflow, and report results.

    Returns:
        Process exit code: 0 on success, 1 on failure, 130 on Ctrl-C
        (the conventional 128+SIGINT code).
    """
    args = _build_parser().parse_args()

    # Setup logging
    setup_logging(args.verbose)

    try:
        # Create workflow configuration directly from the parsed namespace;
        # WorkflowConfig's fields are expected to mirror the argparse dests.
        config = WorkflowConfig(**vars(args))

        # Run processing workflow
        workflow = ProcessingWorkflow(config)
        result = workflow.run()

        # Print final summary
        print("\n" + "=" * 80)
        print("✓ SUCCESS!")
        print("=" * 80)
        print(f"Output directory: {result['output_dir']}")
        if result.get('enhanced_transcript'):
            print("Enhanced transcript ready for AI summarization!")
        print("=" * 80)

        return 0

    except FileNotFoundError as e:
        logging.error(f"File not found: {e}")
        return 1
    except RuntimeError as e:
        logging.error(f"Processing failed: {e}")
        return 1
    except KeyboardInterrupt:
        logging.warning("\nProcessing interrupted by user")
        return 130
    except Exception as e:
        # Last-resort handler: log the full traceback so unexpected failures
        # are diagnosable, but still exit with a clean error code.
        logging.exception(f"Unexpected error: {e}")
        return 1
|
|
|
|
|
|
if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())
|