add whisper to main command, ignore output files

This commit is contained in:
Mariano Gabriel
2025-10-19 22:49:36 -03:00
parent 93e0c06d38
commit ae89564373
5 changed files with 183 additions and 50 deletions

View File

@@ -8,6 +8,8 @@ from pathlib import Path
import sys
import json
import logging
import subprocess
import shutil
from meetus.frame_extractor import FrameExtractor
from meetus.ocr_processor import OCRProcessor
@@ -38,23 +40,78 @@ def setup_logging(verbose: bool = False):
logging.getLogger('paddleocr').setLevel(logging.WARNING)
def run_whisper(video_path: Path, model: str = "base", output_dir: str = "output") -> Path:
    """
    Run Whisper transcription on a video file via the `whisper` CLI.

    Args:
        video_path: Path to the video file to transcribe.
        model: Whisper model to use (tiny, base, small, medium, large).
        output_dir: Directory where Whisper writes its output files.

    Returns:
        Path to the generated JSON transcript
        (``<output_dir>/<video stem>.json``).

    Note:
        Exits the process (sys.exit(1)) if the whisper CLI is missing,
        the transcription fails, or no transcript file is produced —
        this helper is intended for CLI use, not library use.
    """
    # Fail fast with an install hint if the whisper CLI is not on PATH.
    if not shutil.which("whisper"):
        logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
        sys.exit(1)

    # Ensure the output directory exists so this helper also works when
    # called standalone (main() creates it too, but don't rely on that).
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    logger.info(f"Running Whisper transcription (model: {model})...")
    logger.info("This may take a few minutes depending on video length...")

    cmd = [
        "whisper",
        str(video_path),
        "--model", model,
        "--output_format", "json",
        "--output_dir", output_dir,
    ]

    # Keep the try body minimal: only the subprocess call can raise
    # CalledProcessError. Output is captured so stderr is available on failure.
    try:
        subprocess.run(
            cmd,
            check=True,
            capture_output=True,
            text=True
        )
    except subprocess.CalledProcessError as e:
        logger.error(f"Whisper failed: {e.stderr}")
        sys.exit(1)

    # Whisper writes <output_dir>/<video_stem>.json for --output_format json.
    transcript_path = Path(output_dir) / f"{video_path.stem}.json"
    if not transcript_path.exists():
        logger.error("Whisper completed but transcript file not found")
        sys.exit(1)

    logger.info(f"✓ Whisper transcription completed: {transcript_path}")
    return transcript_path
def main():
parser = argparse.ArgumentParser(
description="Extract screen content from meeting recordings and merge with transcripts",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Process video and extract frames only
python process_meeting.py samples/meeting.mkv --extract-only
# Run Whisper + full processing in one command
python process_meeting.py samples/meeting.mkv --run-whisper
# Process video with Whisper transcript
python process_meeting.py samples/meeting.mkv --transcript meeting.json
# Process video with existing Whisper transcript
python process_meeting.py samples/meeting.mkv --transcript output/meeting.json
# Use scene detection instead of interval
python process_meeting.py samples/meeting.mkv --scene-detection
python process_meeting.py samples/meeting.mkv --run-whisper --scene-detection
# Use different OCR engine
python process_meeting.py samples/meeting.mkv --ocr-engine easyocr
# Use different Whisper model and OCR engine
python process_meeting.py samples/meeting.mkv --run-whisper --whisper-model small --ocr-engine easyocr
# Extract frames only (no transcript)
python process_meeting.py samples/meeting.mkv --extract-only
"""
)
@@ -69,12 +126,31 @@ Examples:
default=None
)
parser.add_argument(
'--run-whisper',
action='store_true',
help='Run Whisper transcription before processing'
)
parser.add_argument(
'--whisper-model',
choices=['tiny', 'base', 'small', 'medium', 'large'],
help='Whisper model to use (default: base)',
default='base'
)
parser.add_argument(
'--output', '-o',
help='Output file for enhanced transcript (default: <video>_enhanced.txt)',
help='Output file for enhanced transcript (default: output/<video>_enhanced.txt)',
default=None
)
parser.add_argument(
'--output-dir',
help='Directory for output files (default: output/)',
default='output'
)
parser.add_argument(
'--frames-dir',
help='Directory to save extracted frames (default: frames/)',
@@ -137,9 +213,22 @@ Examples:
logger.error(f"Video file not found: {args.video}")
sys.exit(1)
# Create output directory
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
# Set default output path
if args.output is None:
args.output = video_path.stem + '_enhanced.txt'
args.output = str(output_dir / f"{video_path.stem}_enhanced.txt")
# Run Whisper if requested
if args.run_whisper:
logger.info("=" * 80)
logger.info("STEP 0: Running Whisper Transcription")
logger.info("=" * 80)
transcript_path = run_whisper(video_path, args.whisper_model, str(output_dir))
args.transcript = str(transcript_path)
logger.info("")
logger.info("=" * 80)
logger.info("MEETING PROCESSOR")
@@ -147,6 +236,8 @@ Examples:
logger.info(f"Video: {video_path.name}")
logger.info(f"OCR Engine: {args.ocr_engine}")
logger.info(f"Frame extraction: {'Scene detection' if args.scene_detection else f'Every {args.interval}s'}")
if args.transcript:
logger.info(f"Transcript: {args.transcript}")
logger.info("=" * 80)
# Step 1: Extract frames
@@ -181,7 +272,7 @@ Examples:
sys.exit(1)
# Save OCR results as JSON
ocr_output = video_path.stem + '_ocr.json'
ocr_output = output_dir / f"{video_path.stem}_ocr.json"
with open(ocr_output, 'w', encoding='utf-8') as f:
json.dump(screen_segments, f, indent=2, ensure_ascii=False)
logger.info(f"✓ Saved OCR results to: {ocr_output}")