add whisper to main command, ignore output files
This commit is contained in:
11
.gitignore
vendored
11
.gitignore
vendored
@@ -1,2 +1,11 @@
|
|||||||
!samples/.gitkeep
|
# Sample videos
|
||||||
samples/*
|
samples/*
|
||||||
|
!samples/.gitkeep
|
||||||
|
|
||||||
|
# Output files
|
||||||
|
output/*
|
||||||
|
!output/.gitkeep
|
||||||
|
|
||||||
|
# Extracted frames
|
||||||
|
frames/
|
||||||
|
__pycache__
|
||||||
111
README.md
111
README.md
@@ -41,7 +41,13 @@ brew install ffmpeg
|
|||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
### 3. Optional: Install Alternative OCR Engines
|
### 3. Whisper (for audio transcription)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install openai-whisper
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Optional: Install Alternative OCR Engines
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# EasyOCR (better for rotated/handwritten text)
|
# EasyOCR (better for rotated/handwritten text)
|
||||||
@@ -53,52 +59,67 @@ pip install paddleocr
|
|||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
|
|
||||||
### Basic Usage (Screen Content Only)
|
### Recommended: Run Everything in One Command
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python process_meeting.py samples/meeting.mkv --run-whisper
|
||||||
|
```
|
||||||
|
|
||||||
|
This will:
|
||||||
|
1. Run Whisper transcription (audio → text)
|
||||||
|
2. Extract frames every 5 seconds
|
||||||
|
3. Run OCR to extract screen text
|
||||||
|
4. Merge audio + screen content
|
||||||
|
5. Save everything to `output/` folder
|
||||||
|
|
||||||
|
### Alternative: Use Existing Whisper Transcript
|
||||||
|
|
||||||
|
If you already have a Whisper transcript:
|
||||||
|
```bash
|
||||||
|
python process_meeting.py samples/meeting.mkv --transcript output/meeting.json
|
||||||
|
```
|
||||||
|
|
||||||
|
### Screen Content Only (No Audio)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python process_meeting.py samples/meeting.mkv
|
python process_meeting.py samples/meeting.mkv
|
||||||
```
|
```
|
||||||
|
|
||||||
This will:
|
|
||||||
1. Extract frames every 5 seconds
|
|
||||||
2. Run OCR to extract screen text
|
|
||||||
3. Save enhanced transcript to `meeting_enhanced.txt`
|
|
||||||
|
|
||||||
### With Whisper Transcript
|
|
||||||
|
|
||||||
First, generate a Whisper transcript:
|
|
||||||
```bash
|
|
||||||
whisper samples/meeting.mkv --model base --output_format json
|
|
||||||
```
|
|
||||||
|
|
||||||
Then process with screen content:
|
|
||||||
```bash
|
|
||||||
python process_meeting.py samples/meeting.mkv --transcript samples/meeting.json
|
|
||||||
```
|
|
||||||
|
|
||||||
## Usage Examples
|
## Usage Examples
|
||||||
|
|
||||||
|
### Run with different Whisper models
|
||||||
|
```bash
|
||||||
|
# Tiny model (fastest, less accurate)
|
||||||
|
python process_meeting.py samples/meeting.mkv --run-whisper --whisper-model tiny
|
||||||
|
|
||||||
|
# Small model (balanced)
|
||||||
|
python process_meeting.py samples/meeting.mkv --run-whisper --whisper-model small
|
||||||
|
|
||||||
|
# Large model (slowest, most accurate)
|
||||||
|
python process_meeting.py samples/meeting.mkv --run-whisper --whisper-model large
|
||||||
|
```
|
||||||
|
|
||||||
### Extract frames at different intervals
|
### Extract frames at different intervals
|
||||||
```bash
|
```bash
|
||||||
# Every 10 seconds
|
# Every 10 seconds (with Whisper)
|
||||||
python process_meeting.py samples/meeting.mkv --interval 10
|
python process_meeting.py samples/meeting.mkv --run-whisper --interval 10
|
||||||
|
|
||||||
# Every 3 seconds (more detailed)
|
# Every 3 seconds (more detailed)
|
||||||
python process_meeting.py samples/meeting.mkv --interval 3
|
python process_meeting.py samples/meeting.mkv --run-whisper --interval 3
|
||||||
```
|
```
|
||||||
|
|
||||||
### Use scene detection (smarter, fewer frames)
|
### Use scene detection (smarter, fewer frames)
|
||||||
```bash
|
```bash
|
||||||
python process_meeting.py samples/meeting.mkv --scene-detection
|
python process_meeting.py samples/meeting.mkv --run-whisper --scene-detection
|
||||||
```
|
```
|
||||||
|
|
||||||
### Use different OCR engines
|
### Use different OCR engines
|
||||||
```bash
|
```bash
|
||||||
# EasyOCR (good for varied layouts)
|
# EasyOCR (good for varied layouts)
|
||||||
python process_meeting.py samples/meeting.mkv --ocr-engine easyocr
|
python process_meeting.py samples/meeting.mkv --run-whisper --ocr-engine easyocr
|
||||||
|
|
||||||
# PaddleOCR (good for code/terminal)
|
# PaddleOCR (good for code/terminal)
|
||||||
python process_meeting.py samples/meeting.mkv --ocr-engine paddleocr
|
python process_meeting.py samples/meeting.mkv --run-whisper --ocr-engine paddleocr
|
||||||
```
|
```
|
||||||
|
|
||||||
### Extract frames only (no merging)
|
### Extract frames only (no merging)
|
||||||
@@ -108,41 +129,48 @@ python process_meeting.py samples/meeting.mkv --extract-only
|
|||||||
|
|
||||||
### Custom output location
|
### Custom output location
|
||||||
```bash
|
```bash
|
||||||
python process_meeting.py samples/meeting.mkv --output my_meeting.txt --frames-dir my_frames/
|
python process_meeting.py samples/meeting.mkv --run-whisper --output-dir my_outputs/
|
||||||
```
|
```
|
||||||
|
|
||||||
### Enable verbose logging
|
### Enable verbose logging
|
||||||
```bash
|
```bash
|
||||||
# Show detailed debug information
|
# Show detailed debug information
|
||||||
python process_meeting.py samples/meeting.mkv --verbose
|
python process_meeting.py samples/meeting.mkv --run-whisper --verbose
|
||||||
|
|
||||||
# Short form
|
|
||||||
python process_meeting.py samples/meeting.mkv -v
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Output Files
|
## Output Files
|
||||||
|
|
||||||
After processing, you'll get:
|
Output files are saved to the `output/` directory by default; extracted frames go to `frames/`:
|
||||||
|
|
||||||
- **`<video>_enhanced.txt`** - Enhanced transcript ready for Claude
|
- **`output/<video>_enhanced.txt`** - Enhanced transcript ready for Claude
|
||||||
- **`<video>_ocr.json`** - Raw OCR data with timestamps
|
- **`output/<video>.json`** - Whisper transcript (if `--run-whisper` was used)
|
||||||
|
- **`output/<video>_ocr.json`** - Raw OCR data with timestamps
|
||||||
- **`frames/`** - Extracted video frames (JPG files)
|
- **`frames/`** - Extracted video frames (JPG files)
|
||||||
|
|
||||||
## Workflow for Meeting Analysis
|
## Workflow for Meeting Analysis
|
||||||
|
|
||||||
### Complete Workflow
|
### Complete Workflow (One Command!)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 1. Extract audio and transcribe with Whisper
|
# Process everything in one step
|
||||||
whisper samples/alo-intro1.mkv --model base --output_format json
|
python process_meeting.py samples/alo-intro1.mkv --run-whisper --scene-detection
|
||||||
|
|
||||||
|
# Output will be in output/alo-intro1_enhanced.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
### Traditional Workflow (Separate Steps)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Extract audio and transcribe with Whisper (optional, if not using --run-whisper)
|
||||||
|
whisper samples/alo-intro1.mkv --model base --output_format json --output_dir output
|
||||||
|
|
||||||
# 2. Process video to extract screen content
|
# 2. Process video to extract screen content
|
||||||
python process_meeting.py samples/alo-intro1.mkv \
|
python process_meeting.py samples/alo-intro1.mkv \
|
||||||
--transcript samples/alo-intro1.json \
|
--transcript output/alo-intro1.json \
|
||||||
--scene-detection
|
--scene-detection
|
||||||
|
|
||||||
# 3. Use the enhanced transcript with Claude
|
# 3. Use the enhanced transcript with Claude
|
||||||
# Copy the content from alo-intro1_enhanced.txt and paste into Claude
|
# Copy the content from output/alo-intro1_enhanced.txt and paste into Claude
|
||||||
```
|
```
|
||||||
|
|
||||||
### Example Prompt for Claude
|
### Example Prompt for Claude
|
||||||
@@ -160,7 +188,9 @@ Please summarize this meeting transcript. Pay special attention to:
|
|||||||
## Command Reference
|
## Command Reference
|
||||||
|
|
||||||
```
|
```
|
||||||
usage: process_meeting.py [-h] [--transcript TRANSCRIPT] [--output OUTPUT]
|
usage: process_meeting.py [-h] [--transcript TRANSCRIPT] [--run-whisper]
|
||||||
|
[--whisper-model {tiny,base,small,medium,large}]
|
||||||
|
[--output OUTPUT] [--output-dir OUTPUT_DIR]
|
||||||
[--frames-dir FRAMES_DIR] [--interval INTERVAL]
|
[--frames-dir FRAMES_DIR] [--interval INTERVAL]
|
||||||
[--scene-detection]
|
[--scene-detection]
|
||||||
[--ocr-engine {tesseract,easyocr,paddleocr}]
|
[--ocr-engine {tesseract,easyocr,paddleocr}]
|
||||||
@@ -171,7 +201,10 @@ usage: process_meeting.py [-h] [--transcript TRANSCRIPT] [--output OUTPUT]
|
|||||||
Options:
|
Options:
|
||||||
video Path to video file
|
video Path to video file
|
||||||
--transcript, -t Path to Whisper transcript (JSON or TXT)
|
--transcript, -t Path to Whisper transcript (JSON or TXT)
|
||||||
|
--run-whisper Run Whisper transcription before processing
|
||||||
|
--whisper-model Whisper model: tiny, base, small, medium, large (default: base)
|
||||||
--output, -o Output file for enhanced transcript
|
--output, -o Output file for enhanced transcript
|
||||||
|
--output-dir Directory for output files (default: output/)
|
||||||
--frames-dir Directory to save extracted frames (default: frames/)
|
--frames-dir Directory to save extracted frames (default: frames/)
|
||||||
--interval Extract frame every N seconds (default: 5)
|
--interval Extract frame every N seconds (default: 5)
|
||||||
--scene-detection Use scene detection instead of interval extraction
|
--scene-detection Use scene detection instead of interval extraction
|
||||||
|
|||||||
0
output/.gitkeep
Normal file
0
output/.gitkeep
Normal file
@@ -8,6 +8,8 @@ from pathlib import Path
|
|||||||
import sys
|
import sys
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import subprocess
|
||||||
|
import shutil
|
||||||
|
|
||||||
from meetus.frame_extractor import FrameExtractor
|
from meetus.frame_extractor import FrameExtractor
|
||||||
from meetus.ocr_processor import OCRProcessor
|
from meetus.ocr_processor import OCRProcessor
|
||||||
@@ -38,23 +40,78 @@ def setup_logging(verbose: bool = False):
|
|||||||
logging.getLogger('paddleocr').setLevel(logging.WARNING)
|
logging.getLogger('paddleocr').setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
|
||||||
|
def run_whisper(video_path: Path, model: str = "base", output_dir: str = "output") -> Path:
    """
    Run Whisper transcription on video file.

    Invokes the ``whisper`` command-line tool as a subprocess, asking it to
    write a JSON transcript into ``output_dir``. If the tool is not on PATH,
    exits with a non-zero status, or does not leave the expected transcript
    file behind, the error is logged and the process exits with status 1.

    Args:
        video_path: Path to video file
        model: Whisper model to use (tiny, base, small, medium, large)
        output_dir: Directory to save output

    Returns:
        Path to generated JSON transcript
    """
    # Check if whisper is installed
    if not shutil.which("whisper"):
        logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
        sys.exit(1)

    logger.info(f"Running Whisper transcription (model: {model})...")
    logger.info("This may take a few minutes depending on video length...")

    # Argument list (no shell) so paths with spaces are passed through intact.
    cmd = [
        "whisper",
        str(video_path),
        "--model", model,
        "--output_format", "json",
        "--output_dir", output_dir,
    ]

    # Only the subprocess call can raise CalledProcessError, so keep the try
    # body limited to it. Output is captured so stderr can be reported below.
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        logger.error(f"Whisper failed: {e.stderr}")
        sys.exit(1)

    # Whisper outputs to <output_dir>/<video_stem>.json
    transcript_path = Path(output_dir) / f"{video_path.stem}.json"
    if not transcript_path.exists():
        logger.error("Whisper completed but transcript file not found")
        sys.exit(1)

    logger.info(f"✓ Whisper transcription completed: {transcript_path}")
    return transcript_path
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Extract screen content from meeting recordings and merge with transcripts",
|
description="Extract screen content from meeting recordings and merge with transcripts",
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
epilog="""
|
epilog="""
|
||||||
Examples:
|
Examples:
|
||||||
# Process video and extract frames only
|
# Run Whisper + full processing in one command
|
||||||
python process_meeting.py samples/meeting.mkv --extract-only
|
python process_meeting.py samples/meeting.mkv --run-whisper
|
||||||
|
|
||||||
# Process video with Whisper transcript
|
# Process video with existing Whisper transcript
|
||||||
python process_meeting.py samples/meeting.mkv --transcript meeting.json
|
python process_meeting.py samples/meeting.mkv --transcript output/meeting.json
|
||||||
|
|
||||||
# Use scene detection instead of interval
|
# Use scene detection instead of interval
|
||||||
python process_meeting.py samples/meeting.mkv --scene-detection
|
python process_meeting.py samples/meeting.mkv --run-whisper --scene-detection
|
||||||
|
|
||||||
# Use different OCR engine
|
# Use different Whisper model and OCR engine
|
||||||
python process_meeting.py samples/meeting.mkv --ocr-engine easyocr
|
python process_meeting.py samples/meeting.mkv --run-whisper --whisper-model small --ocr-engine easyocr
|
||||||
|
|
||||||
|
# Extract frames only (no transcript)
|
||||||
|
python process_meeting.py samples/meeting.mkv --extract-only
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -69,12 +126,31 @@ Examples:
|
|||||||
default=None
|
default=None
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--run-whisper',
|
||||||
|
action='store_true',
|
||||||
|
help='Run Whisper transcription before processing'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--whisper-model',
|
||||||
|
choices=['tiny', 'base', 'small', 'medium', 'large'],
|
||||||
|
help='Whisper model to use (default: base)',
|
||||||
|
default='base'
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--output', '-o',
|
'--output', '-o',
|
||||||
help='Output file for enhanced transcript (default: <video>_enhanced.txt)',
|
help='Output file for enhanced transcript (default: output/<video>_enhanced.txt)',
|
||||||
default=None
|
default=None
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--output-dir',
|
||||||
|
help='Directory for output files (default: output/)',
|
||||||
|
default='output'
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--frames-dir',
|
'--frames-dir',
|
||||||
help='Directory to save extracted frames (default: frames/)',
|
help='Directory to save extracted frames (default: frames/)',
|
||||||
@@ -137,9 +213,22 @@ Examples:
|
|||||||
logger.error(f"Video file not found: {args.video}")
|
logger.error(f"Video file not found: {args.video}")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
# Create output directory
|
||||||
|
output_dir = Path(args.output_dir)
|
||||||
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
# Set default output path
|
# Set default output path
|
||||||
if args.output is None:
|
if args.output is None:
|
||||||
args.output = video_path.stem + '_enhanced.txt'
|
args.output = str(output_dir / f"{video_path.stem}_enhanced.txt")
|
||||||
|
|
||||||
|
# Run Whisper if requested
|
||||||
|
if args.run_whisper:
|
||||||
|
logger.info("=" * 80)
|
||||||
|
logger.info("STEP 0: Running Whisper Transcription")
|
||||||
|
logger.info("=" * 80)
|
||||||
|
transcript_path = run_whisper(video_path, args.whisper_model, str(output_dir))
|
||||||
|
args.transcript = str(transcript_path)
|
||||||
|
logger.info("")
|
||||||
|
|
||||||
logger.info("=" * 80)
|
logger.info("=" * 80)
|
||||||
logger.info("MEETING PROCESSOR")
|
logger.info("MEETING PROCESSOR")
|
||||||
@@ -147,6 +236,8 @@ Examples:
|
|||||||
logger.info(f"Video: {video_path.name}")
|
logger.info(f"Video: {video_path.name}")
|
||||||
logger.info(f"OCR Engine: {args.ocr_engine}")
|
logger.info(f"OCR Engine: {args.ocr_engine}")
|
||||||
logger.info(f"Frame extraction: {'Scene detection' if args.scene_detection else f'Every {args.interval}s'}")
|
logger.info(f"Frame extraction: {'Scene detection' if args.scene_detection else f'Every {args.interval}s'}")
|
||||||
|
if args.transcript:
|
||||||
|
logger.info(f"Transcript: {args.transcript}")
|
||||||
logger.info("=" * 80)
|
logger.info("=" * 80)
|
||||||
|
|
||||||
# Step 1: Extract frames
|
# Step 1: Extract frames
|
||||||
@@ -181,7 +272,7 @@ Examples:
|
|||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
# Save OCR results as JSON
|
# Save OCR results as JSON
|
||||||
ocr_output = video_path.stem + '_ocr.json'
|
ocr_output = output_dir / f"{video_path.stem}_ocr.json"
|
||||||
with open(ocr_output, 'w', encoding='utf-8') as f:
|
with open(ocr_output, 'w', encoding='utf-8') as f:
|
||||||
json.dump(screen_segments, f, indent=2, ensure_ascii=False)
|
json.dump(screen_segments, f, indent=2, ensure_ascii=False)
|
||||||
logger.info(f"✓ Saved OCR results to: {ocr_output}")
|
logger.info(f"✓ Saved OCR results to: {ocr_output}")
|
||||||
|
|||||||
0
samples/.gitkeep
Normal file
0
samples/.gitkeep
Normal file
Reference in New Issue
Block a user