refactor
This commit is contained in:
@@ -1,34 +1,19 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Process meeting recordings to extract audio + screen content.
|
||||
Combines Whisper transcripts with OCR from screen shares.
|
||||
Combines Whisper transcripts with vision analysis or OCR from screen shares.
|
||||
"""
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import json
|
||||
import logging
|
||||
import subprocess
|
||||
import shutil
|
||||
|
||||
from meetus.frame_extractor import FrameExtractor
|
||||
from meetus.ocr_processor import OCRProcessor
|
||||
from meetus.vision_processor import VisionProcessor
|
||||
from meetus.transcript_merger import TranscriptMerger
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
from meetus.workflow import WorkflowConfig, ProcessingWorkflow
|
||||
|
||||
|
||||
def setup_logging(verbose: bool = False):
|
||||
"""
|
||||
Configure logging for the application.
|
||||
|
||||
Args:
|
||||
verbose: If True, set DEBUG level, otherwise INFO
|
||||
"""
|
||||
"""Configure logging for the application."""
|
||||
level = logging.DEBUG if verbose else logging.INFO
|
||||
|
||||
# Configure root logger
|
||||
logging.basicConfig(
|
||||
level=level,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||
@@ -41,58 +26,6 @@ def setup_logging(verbose: bool = False):
|
||||
logging.getLogger('paddleocr').setLevel(logging.WARNING)
|
||||
|
||||
|
||||
def run_whisper(video_path: Path, model: str = "base", output_dir: str = "output") -> Path:
    """
    Run Whisper transcription on a video file.

    Invokes the ``whisper`` command-line tool as a subprocess, asking it to
    write a JSON transcript into ``output_dir``, then returns the path of the
    transcript it is expected to have produced.

    Args:
        video_path: Path to video file
        model: Whisper model to use (tiny, base, small, medium, large)
        output_dir: Directory to save output

    Returns:
        Path to generated JSON transcript

    Raises:
        SystemExit: With status 1 if the ``whisper`` executable is not found
            on PATH, if the command exits with a non-zero status, or if the
            expected transcript file does not exist afterwards.
    """
    # Check if whisper is installed
    if not shutil.which("whisper"):
        logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
        sys.exit(1)

    logger.info("Running Whisper transcription (model: %s)...", model)
    logger.info("This may take a few minutes depending on video length...")

    # Argument list (no shell) so paths with spaces or metacharacters are safe.
    cmd = [
        "whisper",
        str(video_path),
        "--model", model,
        "--output_format", "json",
        "--output_dir", output_dir,
    ]

    # Keep the try body to the one call that can raise CalledProcessError.
    # Output is captured so stderr can be reported on failure.
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        logger.error("Whisper failed: %s", e.stderr)
        sys.exit(1)

    # Whisper outputs to <output_dir>/<video_stem>.json
    transcript_path = Path(output_dir) / f"{video_path.stem}.json"

    if not transcript_path.exists():
        logger.error("Whisper completed but transcript file not found")
        sys.exit(1)

    logger.info("✓ Whisper transcription completed: %s", transcript_path)
    return transcript_path
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Extract screen content from meeting recordings and merge with transcripts",
|
||||
@@ -119,23 +52,23 @@ Examples:
|
||||
"""
|
||||
)
|
||||
|
||||
# Required arguments
|
||||
parser.add_argument(
|
||||
'video',
|
||||
help='Path to video file'
|
||||
)
|
||||
|
||||
# Whisper options
|
||||
parser.add_argument(
|
||||
'--transcript', '-t',
|
||||
help='Path to Whisper transcript (JSON or TXT)',
|
||||
default=None
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--run-whisper',
|
||||
action='store_true',
|
||||
help='Run Whisper transcription before processing'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--whisper-model',
|
||||
choices=['tiny', 'base', 'small', 'medium', 'large'],
|
||||
@@ -143,56 +76,48 @@ Examples:
|
||||
default='base'
|
||||
)
|
||||
|
||||
# Output options
|
||||
parser.add_argument(
|
||||
'--output', '-o',
|
||||
help='Output file for enhanced transcript (default: output/<video>_enhanced.txt)',
|
||||
help='Output file for enhanced transcript (default: auto-generated in output directory)',
|
||||
default=None
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--output-dir',
|
||||
help='Directory for output files (default: output/)',
|
||||
help='Base directory for outputs (default: output/)',
|
||||
default='output'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--frames-dir',
|
||||
help='Directory to save extracted frames (default: frames/)',
|
||||
default='frames'
|
||||
)
|
||||
|
||||
# Frame extraction options
|
||||
parser.add_argument(
|
||||
'--interval',
|
||||
type=int,
|
||||
help='Extract frame every N seconds (default: 5)',
|
||||
default=5
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--scene-detection',
|
||||
action='store_true',
|
||||
help='Use scene detection instead of interval extraction'
|
||||
)
|
||||
|
||||
# Analysis options
|
||||
parser.add_argument(
|
||||
'--ocr-engine',
|
||||
choices=['tesseract', 'easyocr', 'paddleocr'],
|
||||
help='OCR engine to use (default: tesseract)',
|
||||
default='tesseract'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--use-vision',
|
||||
action='store_true',
|
||||
help='Use local vision model (Ollama) instead of OCR for better context understanding'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--vision-model',
|
||||
help='Vision model to use with Ollama (default: llava:13b)',
|
||||
default='llava:13b'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--vision-context',
|
||||
choices=['meeting', 'dashboard', 'code', 'console'],
|
||||
@@ -200,24 +125,22 @@ Examples:
|
||||
default='meeting'
|
||||
)
|
||||
|
||||
# Processing options
|
||||
parser.add_argument(
|
||||
'--no-cache',
|
||||
action='store_true',
|
||||
help='Disable caching - reprocess everything even if outputs exist'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--no-deduplicate',
|
||||
action='store_true',
|
||||
help='Disable text deduplication'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--extract-only',
|
||||
action='store_true',
|
||||
help='Only extract frames and OCR, skip transcript merging'
|
||||
help='Only extract frames and analyze, skip transcript merging'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--format',
|
||||
choices=['detailed', 'compact'],
|
||||
@@ -225,6 +148,7 @@ Examples:
|
||||
default='detailed'
|
||||
)
|
||||
|
||||
# Logging
|
||||
parser.add_argument(
|
||||
'--verbose', '-v',
|
||||
action='store_true',
|
||||
@@ -236,166 +160,38 @@ Examples:
|
||||
# Setup logging
|
||||
setup_logging(args.verbose)
|
||||
|
||||
# Validate video path
|
||||
video_path = Path(args.video)
|
||||
if not video_path.exists():
|
||||
logger.error(f"Video file not found: {args.video}")
|
||||
sys.exit(1)
|
||||
try:
|
||||
# Create workflow configuration
|
||||
config = WorkflowConfig(**vars(args))
|
||||
|
||||
# Create output directory
|
||||
output_dir = Path(args.output_dir)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
# Run processing workflow
|
||||
workflow = ProcessingWorkflow(config)
|
||||
result = workflow.run()
|
||||
|
||||
# Set default output path
|
||||
if args.output is None:
|
||||
args.output = str(output_dir / f"{video_path.stem}_enhanced.txt")
|
||||
# Print final summary
|
||||
print("\n" + "=" * 80)
|
||||
print("✓ SUCCESS!")
|
||||
print("=" * 80)
|
||||
print(f"Output directory: {result['output_dir']}")
|
||||
if result.get('enhanced_transcript'):
|
||||
print(f"Enhanced transcript ready for AI summarization!")
|
||||
print("=" * 80)
|
||||
|
||||
# Define cache paths
|
||||
whisper_cache = output_dir / f"{video_path.stem}.json"
|
||||
analysis_cache = output_dir / f"{video_path.stem}_{'vision' if args.use_vision else 'ocr'}.json"
|
||||
frames_cache_dir = Path(args.frames_dir)
|
||||
return 0
|
||||
|
||||
# Check for cached Whisper transcript
|
||||
if args.run_whisper:
|
||||
if not args.no_cache and whisper_cache.exists():
|
||||
logger.info(f"✓ Found cached Whisper transcript: {whisper_cache}")
|
||||
args.transcript = str(whisper_cache)
|
||||
else:
|
||||
logger.info("=" * 80)
|
||||
logger.info("STEP 0: Running Whisper Transcription")
|
||||
logger.info("=" * 80)
|
||||
transcript_path = run_whisper(video_path, args.whisper_model, str(output_dir))
|
||||
args.transcript = str(transcript_path)
|
||||
logger.info("")
|
||||
|
||||
logger.info("=" * 80)
|
||||
logger.info("MEETING PROCESSOR")
|
||||
logger.info("=" * 80)
|
||||
logger.info(f"Video: {video_path.name}")
|
||||
logger.info(f"Analysis: {'Vision Model' if args.use_vision else f'OCR ({args.ocr_engine})'}")
|
||||
if args.use_vision:
|
||||
logger.info(f"Vision Model: {args.vision_model}")
|
||||
logger.info(f"Context: {args.vision_context}")
|
||||
logger.info(f"Frame extraction: {'Scene detection' if args.scene_detection else f'Every {args.interval}s'}")
|
||||
if args.transcript:
|
||||
logger.info(f"Transcript: {args.transcript}")
|
||||
logger.info(f"Caching: {'Disabled' if args.no_cache else 'Enabled'}")
|
||||
logger.info("=" * 80)
|
||||
|
||||
# Step 1: Extract frames (with caching)
|
||||
logger.info("Step 1: Extracting frames from video...")
|
||||
|
||||
# Check if frames already exist
|
||||
existing_frames = list(frames_cache_dir.glob(f"{video_path.stem}_*.jpg")) if frames_cache_dir.exists() else []
|
||||
|
||||
if not args.no_cache and existing_frames and len(existing_frames) > 0:
|
||||
logger.info(f"✓ Found {len(existing_frames)} cached frames in {args.frames_dir}/")
|
||||
# Build frames_info from existing files
|
||||
frames_info = []
|
||||
for frame_path in sorted(existing_frames):
|
||||
# Try to extract timestamp from filename (e.g., video_00001_12.34s.jpg)
|
||||
try:
|
||||
timestamp_str = frame_path.stem.split('_')[-1].rstrip('s')
|
||||
timestamp = float(timestamp_str)
|
||||
except:
|
||||
timestamp = 0.0
|
||||
frames_info.append((str(frame_path), timestamp))
|
||||
else:
|
||||
extractor = FrameExtractor(str(video_path), args.frames_dir)
|
||||
|
||||
if args.scene_detection:
|
||||
frames_info = extractor.extract_scene_changes()
|
||||
else:
|
||||
frames_info = extractor.extract_by_interval(args.interval)
|
||||
|
||||
if not frames_info:
|
||||
logger.error("No frames extracted")
|
||||
sys.exit(1)
|
||||
|
||||
logger.info(f"✓ Extracted {len(frames_info)} frames")
|
||||
|
||||
# Step 2: Run analysis on frames (with caching)
|
||||
if not args.no_cache and analysis_cache.exists():
|
||||
logger.info(f"✓ Found cached analysis results: {analysis_cache}")
|
||||
with open(analysis_cache, 'r', encoding='utf-8') as f:
|
||||
screen_segments = json.load(f)
|
||||
logger.info(f"✓ Loaded {len(screen_segments)} analyzed frames from cache")
|
||||
else:
|
||||
if args.use_vision:
|
||||
# Use vision model
|
||||
logger.info("Step 2: Running vision analysis on extracted frames...")
|
||||
try:
|
||||
vision = VisionProcessor(model=args.vision_model)
|
||||
screen_segments = vision.process_frames(
|
||||
frames_info,
|
||||
context=args.vision_context,
|
||||
deduplicate=not args.no_deduplicate
|
||||
)
|
||||
logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model")
|
||||
|
||||
except ImportError as e:
|
||||
logger.error(f"{e}")
|
||||
sys.exit(1)
|
||||
else:
|
||||
# Use OCR
|
||||
logger.info("Step 2: Running OCR on extracted frames...")
|
||||
try:
|
||||
ocr = OCRProcessor(engine=args.ocr_engine)
|
||||
screen_segments = ocr.process_frames(
|
||||
frames_info,
|
||||
deduplicate=not args.no_deduplicate
|
||||
)
|
||||
logger.info(f"✓ Processed {len(screen_segments)} frames with OCR")
|
||||
|
||||
except ImportError as e:
|
||||
logger.error(f"{e}")
|
||||
logger.error(f"To install {args.ocr_engine}:")
|
||||
logger.error(f" pip install {args.ocr_engine}")
|
||||
sys.exit(1)
|
||||
|
||||
# Save analysis results as JSON
|
||||
with open(analysis_cache, 'w', encoding='utf-8') as f:
|
||||
json.dump(screen_segments, f, indent=2, ensure_ascii=False)
|
||||
logger.info(f"✓ Saved analysis results to: {analysis_cache}")
|
||||
|
||||
if args.extract_only:
|
||||
logger.info("Done! (extract-only mode)")
|
||||
return
|
||||
|
||||
# Step 3: Merge with transcript (if provided)
|
||||
merger = TranscriptMerger()
|
||||
|
||||
if args.transcript:
|
||||
logger.info("Step 3: Merging with Whisper transcript...")
|
||||
transcript_path = Path(args.transcript)
|
||||
|
||||
if not transcript_path.exists():
|
||||
logger.warning(f"Transcript not found: {args.transcript}")
|
||||
logger.info("Proceeding with screen content only...")
|
||||
audio_segments = []
|
||||
else:
|
||||
audio_segments = merger.load_whisper_transcript(str(transcript_path))
|
||||
logger.info(f"✓ Loaded {len(audio_segments)} audio segments")
|
||||
else:
|
||||
logger.info("No transcript provided, using screen content only...")
|
||||
audio_segments = []
|
||||
|
||||
# Merge and format
|
||||
merged = merger.merge_transcripts(audio_segments, screen_segments)
|
||||
formatted = merger.format_for_claude(merged, format_style=args.format)
|
||||
|
||||
# Save output
|
||||
merger.save_transcript(formatted, args.output)
|
||||
|
||||
logger.info("=" * 80)
|
||||
logger.info("✓ PROCESSING COMPLETE!")
|
||||
logger.info("=" * 80)
|
||||
logger.info(f"Enhanced transcript: {args.output}")
|
||||
logger.info(f"OCR data: {ocr_output}")
|
||||
logger.info(f"Frames: {args.frames_dir}/")
|
||||
logger.info("")
|
||||
logger.info("You can now use the enhanced transcript with Claude for summarization!")
|
||||
except FileNotFoundError as e:
|
||||
logging.error(f"File not found: {e}")
|
||||
return 1
|
||||
except RuntimeError as e:
|
||||
logging.error(f"Processing failed: {e}")
|
||||
return 1
|
||||
except KeyboardInterrupt:
|
||||
logging.warning("\nProcessing interrupted by user")
|
||||
return 130
|
||||
except Exception as e:
|
||||
logging.exception(f"Unexpected error: {e}")
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
sys.exit(main())
|
||||
|
||||
Reference in New Issue
Block a user