Initial commit

This commit is contained in:
Mariano Gabriel
2025-10-19 22:17:38 -03:00
commit 93e0c06d38
10 changed files with 969 additions and 0 deletions

229
process_meeting.py Normal file
View File

@@ -0,0 +1,229 @@
#!/usr/bin/env python3
"""
Process meeting recordings to extract audio + screen content.
Combines Whisper transcripts with OCR from screen shares.
"""
import argparse
from pathlib import Path
import sys
import json
import logging
from meetus.frame_extractor import FrameExtractor
from meetus.ocr_processor import OCRProcessor
from meetus.transcript_merger import TranscriptMerger
logger = logging.getLogger(__name__)
def setup_logging(verbose: bool = False):
    """
    Configure logging for the application.

    Args:
        verbose: If True, set DEBUG level, otherwise INFO
    """
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%H:%M:%S'
    )
    # Quiet down chatty third-party libraries so our own output stays readable
    for noisy in ('PIL', 'easyocr', 'paddleocr'):
        logging.getLogger(noisy).setLevel(logging.WARNING)
def _build_parser() -> argparse.ArgumentParser:
    """Build and return the CLI argument parser for the meeting processor."""
    parser = argparse.ArgumentParser(
        description="Extract screen content from meeting recordings and merge with transcripts",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Process video and extract frames only
  python process_meeting.py samples/meeting.mkv --extract-only
  # Process video with Whisper transcript
  python process_meeting.py samples/meeting.mkv --transcript meeting.json
  # Use scene detection instead of interval
  python process_meeting.py samples/meeting.mkv --scene-detection
  # Use different OCR engine
  python process_meeting.py samples/meeting.mkv --ocr-engine easyocr
"""
    )
    parser.add_argument(
        'video',
        help='Path to video file'
    )
    parser.add_argument(
        '--transcript', '-t',
        help='Path to Whisper transcript (JSON or TXT)',
        default=None
    )
    parser.add_argument(
        '--output', '-o',
        help='Output file for enhanced transcript (default: <video>_enhanced.txt)',
        default=None
    )
    parser.add_argument(
        '--frames-dir',
        help='Directory to save extracted frames (default: frames/)',
        default='frames'
    )
    parser.add_argument(
        '--interval',
        type=int,
        help='Extract frame every N seconds (default: 5)',
        default=5
    )
    parser.add_argument(
        '--scene-detection',
        action='store_true',
        help='Use scene detection instead of interval extraction'
    )
    parser.add_argument(
        '--ocr-engine',
        choices=['tesseract', 'easyocr', 'paddleocr'],
        help='OCR engine to use (default: tesseract)',
        default='tesseract'
    )
    parser.add_argument(
        '--no-deduplicate',
        action='store_true',
        help='Disable text deduplication'
    )
    parser.add_argument(
        '--extract-only',
        action='store_true',
        help='Only extract frames and OCR, skip transcript merging'
    )
    parser.add_argument(
        '--format',
        choices=['detailed', 'compact'],
        help='Output format style (default: detailed)',
        default='detailed'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Enable verbose logging (DEBUG level)'
    )
    return parser


def _extract_frames(video_path: Path, frames_dir: str,
                    scene_detection: bool, interval: int) -> list:
    """Step 1: extract frames from the video.

    Exits the process with status 1 if no frames could be extracted.

    Returns:
        List of frame-info records produced by FrameExtractor.
    """
    logger.info("Step 1: Extracting frames from video...")
    extractor = FrameExtractor(str(video_path), frames_dir)
    if scene_detection:
        frames_info = extractor.extract_scene_changes()
    else:
        frames_info = extractor.extract_by_interval(interval)
    if not frames_info:
        logger.error("No frames extracted")
        sys.exit(1)
    logger.info("✓ Extracted %s frames", len(frames_info))
    return frames_info


def _run_ocr(frames_info: list, engine: str, deduplicate: bool) -> list:
    """Step 2: run OCR over extracted frames.

    Exits the process with status 1 (after install instructions) if the
    selected OCR engine's package is not installed.

    Returns:
        List of screen-content segments produced by OCRProcessor.
    """
    logger.info("Step 2: Running OCR on extracted frames...")
    try:
        ocr = OCRProcessor(engine=engine)
        screen_segments = ocr.process_frames(
            frames_info,
            deduplicate=deduplicate
        )
    except ImportError as e:
        logger.error("%s", e)
        logger.error("To install %s:", engine)
        logger.error("  pip install %s", engine)
        sys.exit(1)
    logger.info("✓ Processed %s frames with text content", len(screen_segments))
    return screen_segments


def _load_audio_segments(merger: TranscriptMerger, transcript: str | None) -> list:
    """Load Whisper audio segments, tolerating a missing/absent transcript.

    Returns an empty list when no transcript was given or the file does
    not exist (the pipeline then proceeds with screen content only).
    """
    if not transcript:
        logger.info("No transcript provided, using screen content only...")
        return []
    logger.info("Step 3: Merging with Whisper transcript...")
    transcript_path = Path(transcript)
    if not transcript_path.exists():
        # Best-effort: a missing transcript is a warning, not a fatal error.
        logger.warning(f"Transcript not found: {transcript}")
        logger.info("Proceeding with screen content only...")
        return []
    audio_segments = merger.load_whisper_transcript(str(transcript_path))
    logger.info("✓ Loaded %s audio segments", len(audio_segments))
    return audio_segments


def main():
    """CLI entry point: extract frames, OCR them, and optionally merge
    the screen text with a Whisper transcript into an enhanced transcript.

    Exits with status 1 on a missing video, failed frame extraction, or a
    missing OCR engine package.
    """
    args = _build_parser().parse_args()

    setup_logging(args.verbose)

    # Validate video path before doing any work
    video_path = Path(args.video)
    if not video_path.exists():
        logger.error("Video file not found: %s", args.video)
        sys.exit(1)

    # Default output path: alongside the current working directory,
    # named after the video stem (unchanged legacy behavior).
    if args.output is None:
        args.output = video_path.stem + '_enhanced.txt'

    logger.info("=" * 80)
    logger.info("MEETING PROCESSOR")
    logger.info("=" * 80)
    logger.info("Video: %s", video_path.name)
    logger.info("OCR Engine: %s", args.ocr_engine)
    logger.info("Frame extraction: %s",
                'Scene detection' if args.scene_detection else f'Every {args.interval}s')
    logger.info("=" * 80)

    frames_info = _extract_frames(
        video_path, args.frames_dir, args.scene_detection, args.interval
    )

    screen_segments = _run_ocr(
        frames_info, args.ocr_engine, deduplicate=not args.no_deduplicate
    )

    # Persist raw OCR results so they can be reused without re-processing
    ocr_output = video_path.stem + '_ocr.json'
    with open(ocr_output, 'w', encoding='utf-8') as f:
        json.dump(screen_segments, f, indent=2, ensure_ascii=False)
    logger.info("✓ Saved OCR results to: %s", ocr_output)

    if args.extract_only:
        logger.info("Done! (extract-only mode)")
        return

    # Step 3: merge screen content with the audio transcript (if any)
    merger = TranscriptMerger()
    audio_segments = _load_audio_segments(merger, args.transcript)

    merged = merger.merge_transcripts(audio_segments, screen_segments)
    formatted = merger.format_for_claude(merged, format_style=args.format)
    merger.save_transcript(formatted, args.output)

    logger.info("=" * 80)
    logger.info("✓ PROCESSING COMPLETE!")
    logger.info("=" * 80)
    logger.info("Enhanced transcript: %s", args.output)
    logger.info("OCR data: %s", ocr_output)
    logger.info("Frames: %s/", args.frames_dir)
    logger.info("")
    logger.info("You can now use the enhanced transcript with Claude for summarization!")


if __name__ == '__main__':
    main()