This commit is contained in:
Mariano Gabriel
2025-10-20 00:03:41 -03:00
parent a999bc9093
commit cd7b0aed07
11 changed files with 776 additions and 312 deletions

View File

@@ -1,34 +1,19 @@
#!/usr/bin/env python3
"""
Process meeting recordings to extract audio + screen content.
Combines Whisper transcripts with OCR from screen shares.
Combines Whisper transcripts with vision analysis or OCR from screen shares.
"""
import argparse
from pathlib import Path
import sys
import json
import logging
import subprocess
import shutil
from meetus.frame_extractor import FrameExtractor
from meetus.ocr_processor import OCRProcessor
from meetus.vision_processor import VisionProcessor
from meetus.transcript_merger import TranscriptMerger
logger = logging.getLogger(__name__)
from meetus.workflow import WorkflowConfig, ProcessingWorkflow
def setup_logging(verbose: bool = False):
"""
Configure logging for the application.
Args:
verbose: If True, set DEBUG level, otherwise INFO
"""
"""Configure logging for the application."""
level = logging.DEBUG if verbose else logging.INFO
# Configure root logger
logging.basicConfig(
level=level,
format='%(asctime)s - %(levelname)s - %(message)s',
@@ -41,58 +26,6 @@ def setup_logging(verbose: bool = False):
logging.getLogger('paddleocr').setLevel(logging.WARNING)
def run_whisper(video_path: Path, model: str = "base", output_dir: str = "output") -> Path:
    """
    Run Whisper transcription on a video file using the ``whisper`` CLI.

    Args:
        video_path: Path to video file (a plain string is accepted as well)
        model: Whisper model to use (tiny, base, small, medium, large)
        output_dir: Directory to save output (string or path-like)

    Returns:
        Path to generated JSON transcript

    Note:
        This function does not raise on failure. It logs an error and
        terminates the process with ``sys.exit(1)`` when the ``whisper``
        executable is not on PATH, when the command exits with a non-zero
        status, or when the command succeeds but the expected transcript
        file is missing.
    """
    # Normalise early so `.stem` below works even if a caller passes a str.
    video_path = Path(video_path)

    # Check if whisper is installed
    if not shutil.which("whisper"):
        logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
        sys.exit(1)

    logger.info(f"Running Whisper transcription (model: {model})...")
    logger.info("This may take a few minutes depending on video length...")

    # Run whisper command (argument list, no shell involved)
    cmd = [
        "whisper",
        str(video_path),
        "--model", model,
        "--output_format", "json",
        "--output_dir", str(output_dir),
    ]

    try:
        # Output is captured only so stderr can be reported on failure;
        # the completed-process object itself is not needed.
        subprocess.run(
            cmd,
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        logger.error(f"Whisper failed: {e.stderr}")
        sys.exit(1)

    # Whisper outputs to <output_dir>/<video_stem>.json
    transcript_path = Path(output_dir) / f"{video_path.stem}.json"
    if not transcript_path.exists():
        logger.error("Whisper completed but transcript file not found")
        sys.exit(1)

    logger.info(f"✓ Whisper transcription completed: {transcript_path}")
    return transcript_path
def main():
parser = argparse.ArgumentParser(
description="Extract screen content from meeting recordings and merge with transcripts",
@@ -119,23 +52,23 @@ Examples:
"""
)
# Required arguments
parser.add_argument(
'video',
help='Path to video file'
)
# Whisper options
parser.add_argument(
'--transcript', '-t',
help='Path to Whisper transcript (JSON or TXT)',
default=None
)
parser.add_argument(
'--run-whisper',
action='store_true',
help='Run Whisper transcription before processing'
)
parser.add_argument(
'--whisper-model',
choices=['tiny', 'base', 'small', 'medium', 'large'],
@@ -143,56 +76,48 @@ Examples:
default='base'
)
# Output options
parser.add_argument(
'--output', '-o',
help='Output file for enhanced transcript (default: output/<video>_enhanced.txt)',
help='Output file for enhanced transcript (default: auto-generated in output directory)',
default=None
)
parser.add_argument(
'--output-dir',
help='Directory for output files (default: output/)',
help='Base directory for outputs (default: output/)',
default='output'
)
parser.add_argument(
'--frames-dir',
help='Directory to save extracted frames (default: frames/)',
default='frames'
)
# Frame extraction options
parser.add_argument(
'--interval',
type=int,
help='Extract frame every N seconds (default: 5)',
default=5
)
parser.add_argument(
'--scene-detection',
action='store_true',
help='Use scene detection instead of interval extraction'
)
# Analysis options
parser.add_argument(
'--ocr-engine',
choices=['tesseract', 'easyocr', 'paddleocr'],
help='OCR engine to use (default: tesseract)',
default='tesseract'
)
parser.add_argument(
'--use-vision',
action='store_true',
help='Use local vision model (Ollama) instead of OCR for better context understanding'
)
parser.add_argument(
'--vision-model',
help='Vision model to use with Ollama (default: llava:13b)',
default='llava:13b'
)
parser.add_argument(
'--vision-context',
choices=['meeting', 'dashboard', 'code', 'console'],
@@ -200,24 +125,22 @@ Examples:
default='meeting'
)
# Processing options
parser.add_argument(
'--no-cache',
action='store_true',
help='Disable caching - reprocess everything even if outputs exist'
)
parser.add_argument(
'--no-deduplicate',
action='store_true',
help='Disable text deduplication'
)
parser.add_argument(
'--extract-only',
action='store_true',
help='Only extract frames and OCR, skip transcript merging'
help='Only extract frames and analyze, skip transcript merging'
)
parser.add_argument(
'--format',
choices=['detailed', 'compact'],
@@ -225,6 +148,7 @@ Examples:
default='detailed'
)
# Logging
parser.add_argument(
'--verbose', '-v',
action='store_true',
@@ -236,166 +160,38 @@ Examples:
# Setup logging
setup_logging(args.verbose)
# Validate video path
video_path = Path(args.video)
if not video_path.exists():
logger.error(f"Video file not found: {args.video}")
sys.exit(1)
try:
# Create workflow configuration
config = WorkflowConfig(**vars(args))
# Create output directory
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
# Run processing workflow
workflow = ProcessingWorkflow(config)
result = workflow.run()
# Set default output path
if args.output is None:
args.output = str(output_dir / f"{video_path.stem}_enhanced.txt")
# Print final summary
print("\n" + "=" * 80)
print("✓ SUCCESS!")
print("=" * 80)
print(f"Output directory: {result['output_dir']}")
if result.get('enhanced_transcript'):
print(f"Enhanced transcript ready for AI summarization!")
print("=" * 80)
# Define cache paths
whisper_cache = output_dir / f"{video_path.stem}.json"
analysis_cache = output_dir / f"{video_path.stem}_{'vision' if args.use_vision else 'ocr'}.json"
frames_cache_dir = Path(args.frames_dir)
return 0
# Check for cached Whisper transcript
if args.run_whisper:
if not args.no_cache and whisper_cache.exists():
logger.info(f"✓ Found cached Whisper transcript: {whisper_cache}")
args.transcript = str(whisper_cache)
else:
logger.info("=" * 80)
logger.info("STEP 0: Running Whisper Transcription")
logger.info("=" * 80)
transcript_path = run_whisper(video_path, args.whisper_model, str(output_dir))
args.transcript = str(transcript_path)
logger.info("")
logger.info("=" * 80)
logger.info("MEETING PROCESSOR")
logger.info("=" * 80)
logger.info(f"Video: {video_path.name}")
logger.info(f"Analysis: {'Vision Model' if args.use_vision else f'OCR ({args.ocr_engine})'}")
if args.use_vision:
logger.info(f"Vision Model: {args.vision_model}")
logger.info(f"Context: {args.vision_context}")
logger.info(f"Frame extraction: {'Scene detection' if args.scene_detection else f'Every {args.interval}s'}")
if args.transcript:
logger.info(f"Transcript: {args.transcript}")
logger.info(f"Caching: {'Disabled' if args.no_cache else 'Enabled'}")
logger.info("=" * 80)
# Step 1: Extract frames (with caching)
logger.info("Step 1: Extracting frames from video...")
# Check if frames already exist
existing_frames = list(frames_cache_dir.glob(f"{video_path.stem}_*.jpg")) if frames_cache_dir.exists() else []
if not args.no_cache and existing_frames and len(existing_frames) > 0:
logger.info(f"✓ Found {len(existing_frames)} cached frames in {args.frames_dir}/")
# Build frames_info from existing files
frames_info = []
for frame_path in sorted(existing_frames):
# Try to extract timestamp from filename (e.g., video_00001_12.34s.jpg)
try:
timestamp_str = frame_path.stem.split('_')[-1].rstrip('s')
timestamp = float(timestamp_str)
except:
timestamp = 0.0
frames_info.append((str(frame_path), timestamp))
else:
extractor = FrameExtractor(str(video_path), args.frames_dir)
if args.scene_detection:
frames_info = extractor.extract_scene_changes()
else:
frames_info = extractor.extract_by_interval(args.interval)
if not frames_info:
logger.error("No frames extracted")
sys.exit(1)
logger.info(f"✓ Extracted {len(frames_info)} frames")
# Step 2: Run analysis on frames (with caching)
if not args.no_cache and analysis_cache.exists():
logger.info(f"✓ Found cached analysis results: {analysis_cache}")
with open(analysis_cache, 'r', encoding='utf-8') as f:
screen_segments = json.load(f)
logger.info(f"✓ Loaded {len(screen_segments)} analyzed frames from cache")
else:
if args.use_vision:
# Use vision model
logger.info("Step 2: Running vision analysis on extracted frames...")
try:
vision = VisionProcessor(model=args.vision_model)
screen_segments = vision.process_frames(
frames_info,
context=args.vision_context,
deduplicate=not args.no_deduplicate
)
logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model")
except ImportError as e:
logger.error(f"{e}")
sys.exit(1)
else:
# Use OCR
logger.info("Step 2: Running OCR on extracted frames...")
try:
ocr = OCRProcessor(engine=args.ocr_engine)
screen_segments = ocr.process_frames(
frames_info,
deduplicate=not args.no_deduplicate
)
logger.info(f"✓ Processed {len(screen_segments)} frames with OCR")
except ImportError as e:
logger.error(f"{e}")
logger.error(f"To install {args.ocr_engine}:")
logger.error(f" pip install {args.ocr_engine}")
sys.exit(1)
# Save analysis results as JSON
with open(analysis_cache, 'w', encoding='utf-8') as f:
json.dump(screen_segments, f, indent=2, ensure_ascii=False)
logger.info(f"✓ Saved analysis results to: {analysis_cache}")
if args.extract_only:
logger.info("Done! (extract-only mode)")
return
# Step 3: Merge with transcript (if provided)
merger = TranscriptMerger()
if args.transcript:
logger.info("Step 3: Merging with Whisper transcript...")
transcript_path = Path(args.transcript)
if not transcript_path.exists():
logger.warning(f"Transcript not found: {args.transcript}")
logger.info("Proceeding with screen content only...")
audio_segments = []
else:
audio_segments = merger.load_whisper_transcript(str(transcript_path))
logger.info(f"✓ Loaded {len(audio_segments)} audio segments")
else:
logger.info("No transcript provided, using screen content only...")
audio_segments = []
# Merge and format
merged = merger.merge_transcripts(audio_segments, screen_segments)
formatted = merger.format_for_claude(merged, format_style=args.format)
# Save output
merger.save_transcript(formatted, args.output)
logger.info("=" * 80)
logger.info("✓ PROCESSING COMPLETE!")
logger.info("=" * 80)
logger.info(f"Enhanced transcript: {args.output}")
logger.info(f"OCR data: {ocr_output}")
logger.info(f"Frames: {args.frames_dir}/")
logger.info("")
logger.info("You can now use the enhanced transcript with Claude for summarization!")
except FileNotFoundError as e:
logging.error(f"File not found: {e}")
return 1
except RuntimeError as e:
logging.error(f"Processing failed: {e}")
return 1
except KeyboardInterrupt:
logging.warning("\nProcessing interrupted by user")
return 130
except Exception as e:
logging.exception(f"Unexpected error: {e}")
return 1
if __name__ == '__main__':
main()
sys.exit(main())