#!/usr/bin/env python3
"""
Process meeting recordings to extract audio + screen content.

Combines Whisper transcripts with vision analysis or OCR from screen shares.
"""

import argparse
import sys
import logging

from meetus.workflow import WorkflowConfig, ProcessingWorkflow


def setup_logging(verbose: bool = False):
    """Configure logging for the application.

    Sets the root logger to DEBUG when *verbose* is true (INFO otherwise)
    and quiets chatty third-party image/OCR libraries down to WARNING.
    """
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%H:%M:%S',
    )

    # These libraries log aggressively at INFO/DEBUG; keep them quiet.
    for noisy_logger in ('PIL', 'easyocr', 'paddleocr'):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)


def main():
    """CLI entry point: parse arguments and run the processing workflow.

    Returns a process exit code suitable for ``sys.exit``:
      0   — workflow completed successfully
      1   — missing file, processing failure, or unexpected error
      130 — interrupted by the user (conventional SIGINT code)
    """
    parser = argparse.ArgumentParser(
        description="Extract screen content from meeting recordings and merge with transcripts",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run Whisper + vision analysis (recommended for code/dashboards)
  python process_meeting.py samples/meeting.mkv --run-whisper --use-vision

  # Use vision with specific context hint
  python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --vision-context code

  # Traditional OCR approach
  python process_meeting.py samples/meeting.mkv --run-whisper

  # Re-run analysis using cached frames and transcript
  python process_meeting.py samples/meeting.mkv --use-vision

  # Force reprocessing (ignore cache)
  python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --no-cache

  # Use scene detection for fewer frames
  python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --scene-detection
"""
    )

    # Required arguments
    parser.add_argument(
        'video',
        help='Path to video file'
    )

    # Whisper options
    parser.add_argument(
        '--transcript', '-t',
        help='Path to Whisper transcript (JSON or TXT)',
        default=None
    )
    parser.add_argument(
        '--run-whisper',
        action='store_true',
        help='Run Whisper transcription before processing'
    )
    parser.add_argument(
        '--whisper-model',
        choices=['tiny', 'base', 'small', 'medium', 'large'],
        help='Whisper model to use (default: base)',
        default='base'
    )

    # Output options
    parser.add_argument(
        '--output', '-o',
        help='Output file for enhanced transcript (default: auto-generated in output directory)',
        default=None
    )
    parser.add_argument(
        '--output-dir',
        help='Base directory for outputs (default: output/)',
        default='output'
    )

    # Frame extraction options
    parser.add_argument(
        '--interval',
        type=int,
        help='Extract frame every N seconds (default: 5)',
        default=5
    )
    parser.add_argument(
        '--scene-detection',
        action='store_true',
        help='Use scene detection instead of interval extraction'
    )

    # Analysis options
    parser.add_argument(
        '--ocr-engine',
        choices=['tesseract', 'easyocr', 'paddleocr'],
        help='OCR engine to use (default: tesseract)',
        default='tesseract'
    )
    parser.add_argument(
        '--use-vision',
        action='store_true',
        help='Use local vision model (Ollama) instead of OCR for better context understanding'
    )
    parser.add_argument(
        '--vision-model',
        help='Vision model to use with Ollama (default: llava:13b)',
        default='llava:13b'
    )
    parser.add_argument(
        '--vision-context',
        choices=['meeting', 'dashboard', 'code', 'console'],
        help='Context hint for vision analysis (default: meeting)',
        default='meeting'
    )

    # Processing options
    parser.add_argument(
        '--no-cache',
        action='store_true',
        help='Disable caching - reprocess everything even if outputs exist'
    )
    parser.add_argument(
        '--no-deduplicate',
        action='store_true',
        help='Disable text deduplication'
    )
    parser.add_argument(
        '--extract-only',
        action='store_true',
        help='Only extract frames and analyze, skip transcript merging'
    )
    parser.add_argument(
        '--format',
        choices=['detailed', 'compact'],
        help='Output format style (default: detailed)',
        default='detailed'
    )

    # Logging
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Enable verbose logging (DEBUG level)'
    )

    args = parser.parse_args()

    # Setup logging
    setup_logging(args.verbose)

    try:
        # Create workflow configuration directly from the parsed namespace;
        # assumes WorkflowConfig accepts every CLI flag (including 'verbose')
        # as a keyword argument — TODO confirm against WorkflowConfig.
        config = WorkflowConfig(**vars(args))

        # Run processing workflow
        workflow = ProcessingWorkflow(config)
        result = workflow.run()

        # Print final summary
        print("\n" + "=" * 80)
        print("✓ SUCCESS!")
        print("=" * 80)
        print(f"Output directory: {result['output_dir']}")
        if result.get('enhanced_transcript'):
            # Plain string: no placeholders, so no f-prefix needed (was F541).
            print("Enhanced transcript ready for AI summarization!")
        print("=" * 80)

        return 0

    except FileNotFoundError as e:
        # Lazy %-style args: message is only formatted if the record is emitted.
        logging.error("File not found: %s", e)
        return 1
    except RuntimeError as e:
        logging.error("Processing failed: %s", e)
        return 1
    except KeyboardInterrupt:
        logging.warning("\nProcessing interrupted by user")
        return 130
    except Exception as e:
        # logging.exception also records the full traceback.
        logging.exception("Unexpected error: %s", e)
        return 1


if __name__ == '__main__':
    # Propagate main()'s integer exit code to the shell.
    raise SystemExit(main())