#!/usr/bin/env python3
"""
Process meeting recordings to extract audio + screen content.
Combines Whisper transcripts with vision analysis or OCR from screen shares.
"""
import argparse
import logging
import sys

from meetus.workflow import WorkflowConfig, ProcessingWorkflow


def setup_logging(verbose: bool = False) -> None:
    """Configure logging for the application.

    Args:
        verbose: When True, log at DEBUG level; otherwise INFO.
    """
    level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%H:%M:%S'
    )
    # Suppress verbose output from libraries
    for noisy in ('PIL', 'easyocr', 'paddleocr'):
        logging.getLogger(noisy).setLevel(logging.WARNING)


def main() -> int:
    """Parse CLI arguments, run the processing workflow, and report results.

    Returns:
        Process exit code: 0 on success, 1 on error, 130 on user interrupt.
    """
    parser = argparse.ArgumentParser(
        description="Extract screen content from meeting recordings and merge with transcripts",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Reference frames for LLM analysis (recommended - transcript includes frame paths)
  python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection

  # Adjust frame extraction quality (lower = smaller files)
  python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --embed-quality 60 --scene-detection

  # Hybrid approach: OpenCV + OCR (extracts text from frames)
  python process_meeting.py samples/meeting.mkv --run-whisper --use-hybrid --scene-detection

  # Hybrid + LLM cleanup (best for code formatting)
  python process_meeting.py samples/meeting.mkv --run-whisper --use-hybrid --hybrid-llm-cleanup --scene-detection

  # Iterate on scene threshold (reuse whisper transcript)
  python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --scene-threshold 5 --skip-cache-frames --skip-cache-analysis
        """
    )

    # Required arguments
    parser.add_argument(
        'video',
        help='Path to video file'
    )

    # Whisper options
    parser.add_argument(
        '--transcript', '-t',
        help='Path to Whisper transcript (JSON or TXT)',
        default=None
    )
    parser.add_argument(
        '--run-whisper',
        action='store_true',
        help='Run Whisper transcription before processing'
    )
    parser.add_argument(
        '--whisper-model',
        choices=['tiny', 'base', 'small', 'medium', 'large'],
        help='Whisper model to use (default: medium)',
        default='medium'
    )
    parser.add_argument(
        '--diarize',
        action='store_true',
        help='Use WhisperX with speaker diarization (requires whisperx and HuggingFace token)'
    )

    # Output options
    parser.add_argument(
        '--output', '-o',
        help='Output file for enhanced transcript (default: auto-generated in output directory)',
        default=None
    )
    parser.add_argument(
        '--output-dir',
        help='Base directory for outputs (default: output/)',
        default='output'
    )

    # Frame extraction options
    parser.add_argument(
        '--interval',
        type=int,
        help='Extract frame every N seconds (default: 5)',
        default=5
    )
    parser.add_argument(
        '--scene-detection',
        action='store_true',
        help='Use scene detection instead of interval extraction'
    )
    parser.add_argument(
        '--scene-threshold',
        type=float,
        help='Scene detection threshold (0-100, lower=more sensitive, default: 15)',
        default=15.0
    )

    # Analysis options
    parser.add_argument(
        '--ocr-engine',
        choices=['tesseract', 'easyocr', 'paddleocr'],
        help='OCR engine to use (default: tesseract)',
        default='tesseract'
    )
    parser.add_argument(
        '--use-vision',
        action='store_true',
        help='Use local vision model (Ollama) instead of OCR for better context understanding'
    )
    parser.add_argument(
        '--use-hybrid',
        action='store_true',
        help='Use hybrid approach: OpenCV text detection + OCR (more accurate than vision models)'
    )
    parser.add_argument(
        '--hybrid-llm-cleanup',
        action='store_true',
        help='Use LLM to clean up OCR output and preserve code formatting (requires --use-hybrid)'
    )
    parser.add_argument(
        '--hybrid-llm-model',
        help='LLM model for cleanup (default: llama3.2:3b)',
        default='llama3.2:3b'
    )
    parser.add_argument(
        '--vision-model',
        help='Vision model to use with Ollama (default: llava:13b)',
        default='llava:13b'
    )
    parser.add_argument(
        '--vision-context',
        choices=['meeting', 'dashboard', 'code', 'console'],
        help='Context hint for vision analysis (default: meeting)',
        default='meeting'
    )

    # Processing options
    parser.add_argument(
        '--no-cache',
        action='store_true',
        help='Disable caching - reprocess everything even if outputs exist'
    )
    parser.add_argument(
        '--skip-cache-frames',
        action='store_true',
        help='Skip cached frames, re-extract from video (but keep whisper/analysis cache)'
    )
    parser.add_argument(
        '--skip-cache-whisper',
        action='store_true',
        help='Skip cached whisper transcript, re-run transcription (but keep frames/analysis cache)'
    )
    parser.add_argument(
        '--skip-cache-analysis',
        action='store_true',
        help='Skip cached analysis, re-run OCR/vision (but keep frames/whisper cache)'
    )
    parser.add_argument(
        '--no-deduplicate',
        action='store_true',
        help='Disable text deduplication'
    )
    parser.add_argument(
        '--extract-only',
        action='store_true',
        help='Only extract frames and analyze, skip transcript merging'
    )
    parser.add_argument(
        '--format',
        choices=['detailed', 'compact'],
        help='Output format style (default: detailed)',
        default='detailed'
    )
    parser.add_argument(
        '--embed-images',
        action='store_true',
        help='Skip OCR/vision analysis and reference frame files directly (faster, lets LLM analyze images)'
    )
    parser.add_argument(
        '--embed-quality',
        type=int,
        help='JPEG quality for extracted frames (default: 80, lower = smaller files)',
        default=80
    )

    # Logging
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Enable verbose logging (DEBUG level)'
    )

    args = parser.parse_args()

    # Setup logging
    setup_logging(args.verbose)

    try:
        # Create workflow configuration; WorkflowConfig's fields are assumed to
        # mirror the CLI argument names — TODO confirm against meetus.workflow.
        config = WorkflowConfig(**vars(args))

        # Run processing workflow
        workflow = ProcessingWorkflow(config)
        result = workflow.run()

        # Print final summary
        print("\n" + "=" * 80)
        print("✓ SUCCESS!")
        print("=" * 80)
        print(f"Output directory: {result['output_dir']}")
        if result.get('enhanced_transcript'):
            print("Enhanced transcript ready for AI summarization!")
        print("=" * 80)
        return 0

    except FileNotFoundError as e:
        # Lazy %-style args avoid formatting the message when logging is disabled
        logging.error("File not found: %s", e)
        return 1
    except RuntimeError as e:
        logging.error("Processing failed: %s", e)
        return 1
    except KeyboardInterrupt:
        logging.warning("\nProcessing interrupted by user")
        # 130 = conventional exit code for SIGINT (128 + 2)
        return 130
    except Exception as e:
        # logging.exception appends the full traceback automatically
        logging.exception("Unexpected error: %s", e)
        return 1


if __name__ == '__main__':
    sys.exit(main())