refactor

2025-10-20 00:03:41 -03:00
parent a999bc9093
commit cd7b0aed07
11 changed files with 776 additions and 312 deletions
--- a/process_meeting.py
+++ b/process_meeting.py
@@ -1,34 +1,19 @@
 #!/usr/bin/env python3
 """
 Process meeting recordings to extract audio + screen content.
-Combines Whisper transcripts with OCR from screen shares.
+Combines Whisper transcripts with vision analysis or OCR from screen shares.
 """
 import argparse
-from pathlib import Path
 import sys
-import json
 import logging
-import subprocess
-import shutil

-from meetus.frame_extractor import FrameExtractor
-from meetus.ocr_processor import OCRProcessor
-from meetus.vision_processor import VisionProcessor
-from meetus.transcript_merger import TranscriptMerger
-
-logger = logging.getLogger(__name__)
+from meetus.workflow import WorkflowConfig, ProcessingWorkflow


 def setup_logging(verbose: bool = False):
-    """
-    Configure logging for the application.
-
-    Args:
-        verbose: If True, set DEBUG level, otherwise INFO
-    """
+    """Configure logging for the application."""
    level = logging.DEBUG if verbose else logging.INFO

-    # Configure root logger
    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(levelname)s - %(message)s',
@@ -41,58 +26,6 @@ def setup_logging(verbose: bool = False):
    logging.getLogger('paddleocr').setLevel(logging.WARNING)


-def run_whisper(video_path: Path, model: str = "base", output_dir: str = "output") -> Path:
-    """
-    Run Whisper transcription on video file.
-
-    Args:
-        video_path: Path to video file
-        model: Whisper model to use (tiny, base, small, medium, large)
-        output_dir: Directory to save output
-
-    Returns:
-        Path to generated JSON transcript
-    """
-    # Check if whisper is installed
-    if not shutil.which("whisper"):
-        logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
-        sys.exit(1)
-
-    logger.info(f"Running Whisper transcription (model: {model})...")
-    logger.info("This may take a few minutes depending on video length...")
-
-    # Run whisper command
-    cmd = [
-        "whisper",
-        str(video_path),
-        "--model", model,
-        "--output_format", "json",
-        "--output_dir", output_dir
-    ]
-
-    try:
-        result = subprocess.run(
-            cmd,
-            check=True,
-            capture_output=True,
-            text=True
-        )
-
-        # Whisper outputs to <output_dir>/<video_stem>.json
-        transcript_path = Path(output_dir) / f"{video_path.stem}.json"
-
-        if transcript_path.exists():
-            logger.info(f"✓ Whisper transcription completed: {transcript_path}")
-            return transcript_path
-        else:
-            logger.error("Whisper completed but transcript file not found")
-            sys.exit(1)
-
-    except subprocess.CalledProcessError as e:
-        logger.error(f"Whisper failed: {e.stderr}")
-        sys.exit(1)
-
-
 def main():
    parser = argparse.ArgumentParser(
        description="Extract screen content from meeting recordings and merge with transcripts",
@@ -119,23 +52,23 @@ Examples:
        """
    )

+    # Required arguments
    parser.add_argument(
        'video',
        help='Path to video file'
    )

+    # Whisper options
    parser.add_argument(
        '--transcript', '-t',
        help='Path to Whisper transcript (JSON or TXT)',
        default=None
    )
-
    parser.add_argument(
        '--run-whisper',
        action='store_true',
        help='Run Whisper transcription before processing'
    )
-
    parser.add_argument(
        '--whisper-model',
        choices=['tiny', 'base', 'small', 'medium', 'large'],
@@ -143,56 +76,48 @@ Examples:
        default='base'
    )

+    # Output options
    parser.add_argument(
        '--output', '-o',
-        help='Output file for enhanced transcript (default: output/<video>_enhanced.txt)',
+        help='Output file for enhanced transcript (default: auto-generated in output directory)',
        default=None
    )
-
    parser.add_argument(
        '--output-dir',
-        help='Directory for output files (default: output/)',
+        help='Base directory for outputs (default: output/)',
        default='output'
    )

-    parser.add_argument(
-        '--frames-dir',
-        help='Directory to save extracted frames (default: frames/)',
-        default='frames'
-    )
-
+    # Frame extraction options
    parser.add_argument(
        '--interval',
        type=int,
        help='Extract frame every N seconds (default: 5)',
        default=5
    )
-
    parser.add_argument(
        '--scene-detection',
        action='store_true',
        help='Use scene detection instead of interval extraction'
    )

+    # Analysis options
    parser.add_argument(
        '--ocr-engine',
        choices=['tesseract', 'easyocr', 'paddleocr'],
        help='OCR engine to use (default: tesseract)',
        default='tesseract'
    )
-
    parser.add_argument(
        '--use-vision',
        action='store_true',
        help='Use local vision model (Ollama) instead of OCR for better context understanding'
    )
-
    parser.add_argument(
        '--vision-model',
        help='Vision model to use with Ollama (default: llava:13b)',
        default='llava:13b'
    )
-
    parser.add_argument(
        '--vision-context',
        choices=['meeting', 'dashboard', 'code', 'console'],
@@ -200,24 +125,22 @@ Examples:
        default='meeting'
    )

+    # Processing options
    parser.add_argument(
        '--no-cache',
        action='store_true',
        help='Disable caching - reprocess everything even if outputs exist'
    )
-
    parser.add_argument(
        '--no-deduplicate',
        action='store_true',
        help='Disable text deduplication'
    )
-
    parser.add_argument(
        '--extract-only',
        action='store_true',
-        help='Only extract frames and OCR, skip transcript merging'
+        help='Only extract frames and analyze, skip transcript merging'
    )
-
    parser.add_argument(
        '--format',
        choices=['detailed', 'compact'],
@@ -225,6 +148,7 @@ Examples:
        default='detailed'
    )

+    # Logging
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
@@ -236,166 +160,38 @@ Examples:
    # Setup logging
    setup_logging(args.verbose)

-    # Validate video path
-    video_path = Path(args.video)
-    if not video_path.exists():
-        logger.error(f"Video file not found: {args.video}")
-        sys.exit(1)
+    try:
+        # Create workflow configuration
+        config = WorkflowConfig(**vars(args))

-    # Create output directory
-    output_dir = Path(args.output_dir)
-    output_dir.mkdir(parents=True, exist_ok=True)
+        # Run processing workflow
+        workflow = ProcessingWorkflow(config)
+        result = workflow.run()

-    # Set default output path
-    if args.output is None:
-        args.output = str(output_dir / f"{video_path.stem}_enhanced.txt")
+        # Print final summary
+        print("\n" + "=" * 80)
+        print("✓ SUCCESS!")
+        print("=" * 80)
+        print(f"Output directory: {result['output_dir']}")
+        if result.get('enhanced_transcript'):
+            print(f"Enhanced transcript ready for AI summarization!")
+        print("=" * 80)

-    # Define cache paths
-    whisper_cache = output_dir / f"{video_path.stem}.json"
-    analysis_cache = output_dir / f"{video_path.stem}_{'vision' if args.use_vision else 'ocr'}.json"
-    frames_cache_dir = Path(args.frames_dir)
+        return 0

-    # Check for cached Whisper transcript
-    if args.run_whisper:
-        if not args.no_cache and whisper_cache.exists():
-            logger.info(f"✓ Found cached Whisper transcript: {whisper_cache}")
-            args.transcript = str(whisper_cache)
-        else:
-            logger.info("=" * 80)
-            logger.info("STEP 0: Running Whisper Transcription")
-            logger.info("=" * 80)
-            transcript_path = run_whisper(video_path, args.whisper_model, str(output_dir))
-            args.transcript = str(transcript_path)
-            logger.info("")
-
-    logger.info("=" * 80)
-    logger.info("MEETING PROCESSOR")
-    logger.info("=" * 80)
-    logger.info(f"Video: {video_path.name}")
-    logger.info(f"Analysis: {'Vision Model' if args.use_vision else f'OCR ({args.ocr_engine})'}")
-    if args.use_vision:
-        logger.info(f"Vision Model: {args.vision_model}")
-        logger.info(f"Context: {args.vision_context}")
-    logger.info(f"Frame extraction: {'Scene detection' if args.scene_detection else f'Every {args.interval}s'}")
-    if args.transcript:
-        logger.info(f"Transcript: {args.transcript}")
-    logger.info(f"Caching: {'Disabled' if args.no_cache else 'Enabled'}")
-    logger.info("=" * 80)
-
-    # Step 1: Extract frames (with caching)
-    logger.info("Step 1: Extracting frames from video...")
-
-    # Check if frames already exist
-    existing_frames = list(frames_cache_dir.glob(f"{video_path.stem}_*.jpg")) if frames_cache_dir.exists() else []
-
-    if not args.no_cache and existing_frames and len(existing_frames) > 0:
-        logger.info(f"✓ Found {len(existing_frames)} cached frames in {args.frames_dir}/")
-        # Build frames_info from existing files
-        frames_info = []
-        for frame_path in sorted(existing_frames):
-            # Try to extract timestamp from filename (e.g., video_00001_12.34s.jpg)
-            try:
-                timestamp_str = frame_path.stem.split('_')[-1].rstrip('s')
-                timestamp = float(timestamp_str)
-            except:
-                timestamp = 0.0
-            frames_info.append((str(frame_path), timestamp))
-    else:
-        extractor = FrameExtractor(str(video_path), args.frames_dir)
-
-        if args.scene_detection:
-            frames_info = extractor.extract_scene_changes()
-        else:
-            frames_info = extractor.extract_by_interval(args.interval)
-
-        if not frames_info:
-            logger.error("No frames extracted")
-            sys.exit(1)
-
-        logger.info(f"✓ Extracted {len(frames_info)} frames")
-
-    # Step 2: Run analysis on frames (with caching)
-    if not args.no_cache and analysis_cache.exists():
-        logger.info(f"✓ Found cached analysis results: {analysis_cache}")
-        with open(analysis_cache, 'r', encoding='utf-8') as f:
-            screen_segments = json.load(f)
-        logger.info(f"✓ Loaded {len(screen_segments)} analyzed frames from cache")
-    else:
-        if args.use_vision:
-            # Use vision model
-            logger.info("Step 2: Running vision analysis on extracted frames...")
-            try:
-                vision = VisionProcessor(model=args.vision_model)
-                screen_segments = vision.process_frames(
-                    frames_info,
-                    context=args.vision_context,
-                    deduplicate=not args.no_deduplicate
-                )
-                logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model")
-
-            except ImportError as e:
-                logger.error(f"{e}")
-                sys.exit(1)
-        else:
-            # Use OCR
-            logger.info("Step 2: Running OCR on extracted frames...")
-            try:
-                ocr = OCRProcessor(engine=args.ocr_engine)
-                screen_segments = ocr.process_frames(
-                    frames_info,
-                    deduplicate=not args.no_deduplicate
-                )
-                logger.info(f"✓ Processed {len(screen_segments)} frames with OCR")
-
-            except ImportError as e:
-                logger.error(f"{e}")
-                logger.error(f"To install {args.ocr_engine}:")
-                logger.error(f"  pip install {args.ocr_engine}")
-                sys.exit(1)
-
-        # Save analysis results as JSON
-        with open(analysis_cache, 'w', encoding='utf-8') as f:
-            json.dump(screen_segments, f, indent=2, ensure_ascii=False)
-        logger.info(f"✓ Saved analysis results to: {analysis_cache}")
-
-    if args.extract_only:
-        logger.info("Done! (extract-only mode)")
-        return
-
-    # Step 3: Merge with transcript (if provided)
-    merger = TranscriptMerger()
-
-    if args.transcript:
-        logger.info("Step 3: Merging with Whisper transcript...")
-        transcript_path = Path(args.transcript)
-
-        if not transcript_path.exists():
-            logger.warning(f"Transcript not found: {args.transcript}")
-            logger.info("Proceeding with screen content only...")
-            audio_segments = []
-        else:
-            audio_segments = merger.load_whisper_transcript(str(transcript_path))
-            logger.info(f"✓ Loaded {len(audio_segments)} audio segments")
-    else:
-        logger.info("No transcript provided, using screen content only...")
-        audio_segments = []
-
-    # Merge and format
-    merged = merger.merge_transcripts(audio_segments, screen_segments)
-    formatted = merger.format_for_claude(merged, format_style=args.format)
-
-    # Save output
-    merger.save_transcript(formatted, args.output)
-
-    logger.info("=" * 80)
-    logger.info("✓ PROCESSING COMPLETE!")
-    logger.info("=" * 80)
-    logger.info(f"Enhanced transcript: {args.output}")
-    logger.info(f"OCR data: {ocr_output}")
-    logger.info(f"Frames: {args.frames_dir}/")
-    logger.info("")
-    logger.info("You can now use the enhanced transcript with Claude for summarization!")
+    except FileNotFoundError as e:
+        logging.error(f"File not found: {e}")
+        return 1
+    except RuntimeError as e:
+        logging.error(f"Processing failed: {e}")
+        return 1
+    except KeyboardInterrupt:
+        logging.warning("\nProcessing interrupted by user")
+        return 130
+    except Exception as e:
+        logging.exception(f"Unexpected error: {e}")
+        return 1


 if __name__ == '__main__':
-    main()
+    sys.exit(main())