add whisperx support

This commit is contained in:
Mariano Gabriel
2025-12-02 02:33:39 -03:00
parent 118ef04223
commit 7b919beda6
4 changed files with 155 additions and 38 deletions

View File

@@ -0,0 +1,124 @@
# 05 - Reference Frame Files Instead of Embedding
## Date
2025-10-28
## Context
Embedding base64 images made the enhanced transcript files very large (3.7MB for ~40 frames). This made them harder to work with and slower to process.
## Problem
- Enhanced transcript with embedded base64 images was 3.7MB
- Large file size makes it slow to read/process
- Difficult to inspect individual frames
- Harder to share and version control
## Solution: Reference Frame Paths
Instead of embedding base64 image data, reference the frame files by their relative paths.
### Before (Embedded):
```
[00:08] SCREEN CONTENT:
IMAGE (base64, 85KB):
<image>data:image/jpeg;base64,/9j/4AAQSkZJRg...</image>
```
File size: 3.7MB
### After (Referenced):
```
[00:08] SCREEN CONTENT:
Frame: frames/zaca-run-scrapers_00257.jpg
```
File size: ~50KB
## Implementation
**Directory Structure:**
```
output/20251028-003-zaca-run-scrapers/
├── frames/
│ ├── zaca-run-scrapers_00257.jpg
│ ├── zaca-run-scrapers_00487.jpg
│ └── ...
├── zaca-run-scrapers.json (whisper transcript)
└── zaca-run-scrapers_enhanced.txt (references frames/ directory)
```
**Enhanced Transcript Format:**
```
================================================================================
ENHANCED MEETING TRANSCRIPT
Audio transcript + Screen frames
================================================================================
[00:30] SPEAKER:
Bueno, te dio un tour para el proyecto...
[00:08] SCREEN CONTENT:
Frame: frames/zaca-run-scrapers_00257.jpg
[01:00] SPEAKER:
Mayormente en Scrapping lo que tenemos...
[01:15] SCREEN CONTENT:
Frame: frames/zaca-run-scrapers_00487.jpg
TEXT:
| Code snippet from screen (if OCR was used)
```
## Benefits
- **Much smaller files**: ~50KB vs 3.7MB (74x smaller!)
- **Easier to inspect**: Individual frames can be viewed directly
- **LLM can access images**: Frame paths allow an LLM to load images on demand
- **Better version control**: Text files are small and diffable
- **Cleaner structure**: Frames are organized in a dedicated directory
- **Flexible**: OCR/vision analysis can still be run if needed (adds a TEXT section)
## Flags
**`--embed-images`**: Skip OCR/vision analysis and only reference frame files. (Despite its name, this flag no longer embeds image data; the transcript contains frame paths only.)
- Faster (no analysis needed)
- Lets LLM analyze raw images
- Enhanced transcript only contains frame references

**Without `--embed-images`**: Run OCR/vision analysis
- Extracts text from frames
- Enhanced transcript includes both frame reference AND extracted text
- Useful for code/dashboard analysis
## Usage
```bash
# Reference frames only (no OCR, faster)
python process_meeting.py samples/video.mkv --run-whisper --embed-images --scene-detection -v
# Reference frames + OCR text extraction
python process_meeting.py samples/video.mkv --run-whisper --use-hybrid --scene-detection -v
# Adjust frame quality (smaller files)
python process_meeting.py samples/video.mkv --run-whisper --embed-images --embed-quality 60 --scene-detection -v
```
## Files Modified
- `meetus/transcript_merger.py` - Modified `_format_detailed()` to output frame paths instead of base64
- `process_meeting.py` - Updated help text and examples to reflect frame referencing
- All processors (OCR, vision, hybrid) already include `frame_path` in results (no changes needed)
## Workflow Example
```bash
# First run: Generate everything
python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection -v
# Result:
# - output/20251028-004-meeting/
# - frames/ (40 frames, ~80KB each)
# - meeting.json (whisper transcript)
# - meeting_enhanced.txt (~50KB, references frames/)
# LLM can now:
# 1. Read enhanced transcript
# 2. See timeline of audio + screen changes
# 3. Load individual frames as needed from frames/ directory
```

View File

@@ -45,14 +45,15 @@ class TranscriptMerger:
with open(path, 'r', encoding='utf-8') as f: with open(path, 'r', encoding='utf-8') as f:
data = json.load(f) data = json.load(f)
# Handle different Whisper output formats # Handle different Whisper/WhisperX output formats
segments = [] segments = []
if isinstance(data, dict) and 'segments' in data: if isinstance(data, dict) and 'segments' in data:
# Standard Whisper JSON format # Standard Whisper/WhisperX JSON format
segments = [ segments = [
{ {
'timestamp': seg.get('start', 0), 'timestamp': seg.get('start', 0),
'text': seg['text'].strip(), 'text': seg['text'].strip(),
'speaker': seg.get('speaker'), # WhisperX diarization
'type': 'audio' 'type': 'audio'
} }
for seg in data['segments'] for seg in data['segments']
@@ -63,6 +64,7 @@ class TranscriptMerger:
{ {
'timestamp': seg.get('start', seg.get('timestamp', 0)), 'timestamp': seg.get('start', seg.get('timestamp', 0)),
'text': seg['text'].strip(), 'text': seg['text'].strip(),
'speaker': seg.get('speaker'), # WhisperX diarization
'type': 'audio' 'type': 'audio'
} }
for seg in data for seg in data
@@ -207,35 +209,28 @@ class TranscriptMerger:
lines = [] lines = []
lines.append("=" * 80) lines.append("=" * 80)
lines.append("ENHANCED MEETING TRANSCRIPT") lines.append("ENHANCED MEETING TRANSCRIPT")
if self.embed_images: lines.append("Audio transcript + Screen frames")
lines.append("Audio transcript + Embedded frame images (base64)")
else:
lines.append("Audio transcript + Screen content")
lines.append("=" * 80) lines.append("=" * 80)
lines.append("") lines.append("")
total_image_bytes = 0
for seg in segments: for seg in segments:
timestamp = self._format_timestamp(seg['timestamp']) timestamp = self._format_timestamp(seg['timestamp'])
if seg['type'] == 'audio': if seg['type'] == 'audio':
lines.append(f"[{timestamp}] SPEAKER:") speaker = seg.get('speaker', 'SPEAKER')
lines.append(f"[{timestamp}] {speaker}:")
lines.append(f" {seg['text']}") lines.append(f" {seg['text']}")
lines.append("") lines.append("")
else: # screen else: # screen
lines.append(f"[{timestamp}] SCREEN CONTENT:") lines.append(f"[{timestamp}] SCREEN CONTENT:")
# Embed image if requested # Show frame path if available
if self.embed_images and 'frame_path' in seg: if 'frame_path' in seg:
b64_img, img_size = self._encode_image_base64(seg['frame_path']) # Get just the filename relative to the enhanced transcript
total_image_bytes += img_size frame_path = Path(seg['frame_path'])
relative_path = f"frames/{frame_path.name}"
if b64_img: lines.append(f" Frame: {relative_path}")
lines.append(f" IMAGE (base64, {img_size // 1024}KB):")
lines.append(f" <image>data:image/jpeg;base64,{b64_img}</image>")
lines.append("")
# Include text content if available (fallback or additional context) # Include text content if available (fallback or additional context)
if 'text' in seg and seg['text'].strip(): if 'text' in seg and seg['text'].strip():
@@ -245,12 +240,6 @@ class TranscriptMerger:
lines.append("") lines.append("")
if self.embed_images and total_image_bytes > 0:
total_mb = total_image_bytes / (1024 * 1024)
lines.append("")
lines.append(f"Total embedded images size: {total_mb:.2f} MB")
logger.info(f"Embedded {len([s for s in segments if s['type'] == 'screen'])} images, total size: {total_mb:.2f} MB")
return "\n".join(lines) return "\n".join(lines)
def _format_compact(self, segments: List[Dict]) -> str: def _format_compact(self, segments: List[Dict]) -> str:
@@ -259,7 +248,10 @@ class TranscriptMerger:
for seg in segments: for seg in segments:
timestamp = self._format_timestamp(seg['timestamp']) timestamp = self._format_timestamp(seg['timestamp'])
prefix = "SPEAKER" if seg['type'] == 'audio' else "SCREEN" if seg['type'] == 'audio':
prefix = seg.get('speaker', 'SPEAKER')
else:
prefix = "SCREEN"
text = seg['text'].replace('\n', ' ')[:200] # Truncate long screen text text = seg['text'].replace('\n', ' ')[:200] # Truncate long screen text
lines.append(f"[{timestamp}] {prefix}: {text}") lines.append(f"[{timestamp}] {prefix}: {text}")

View File

@@ -184,10 +184,10 @@ class ProcessingWorkflow:
logger.info("STEP 0: Running Whisper Transcription") logger.info("STEP 0: Running Whisper Transcription")
logger.info("=" * 80) logger.info("=" * 80)
# Check if whisper is installed # Check if whisperx is installed
if not shutil.which("whisper"): if not shutil.which("whisperx"):
logger.error("Whisper is not installed. Install it with: pip install openai-whisper") logger.error("WhisperX is not installed. Install it with: pip install whisperx")
raise RuntimeError("Whisper not installed") raise RuntimeError("WhisperX not installed")
# Unload Ollama model to free GPU memory for Whisper (if using vision) # Unload Ollama model to free GPU memory for Whisper (if using vision)
if self.config.use_vision: if self.config.use_vision:
@@ -199,16 +199,17 @@ class ProcessingWorkflow:
except Exception as e: except Exception as e:
logger.warning(f"Could not unload Ollama model: {e}") logger.warning(f"Could not unload Ollama model: {e}")
logger.info(f"Running Whisper transcription (model: {self.config.whisper_model})...") logger.info(f"Running WhisperX transcription with diarization (model: {self.config.whisper_model})...")
logger.info("This may take a few minutes depending on video length...") logger.info("This may take a few minutes depending on video length...")
# Run whisper command # Run whisperx command with diarization
cmd = [ cmd = [
"whisper", "whisperx",
str(self.config.video_path), str(self.config.video_path),
"--model", self.config.whisper_model, "--model", self.config.whisper_model,
"--output_format", "json", "--output_format", "json",
"--output_dir", str(self.output_mgr.output_dir) "--output_dir", str(self.output_mgr.output_dir),
"--diarize",
] ]
try: try:

View File

@@ -32,13 +32,13 @@ def main():
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=""" epilog="""
Examples: Examples:
# Embed images for LLM analysis (recommended - let LLM analyze actual frames) # Reference frames for LLM analysis (recommended - transcript includes frame paths)
python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection
# Embed with custom quality (lower = smaller file size) # Adjust frame extraction quality (lower = smaller files)
python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --embed-quality 60 --scene-detection python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --embed-quality 60 --scene-detection
# Hybrid approach: OpenCV + OCR (extracts text, no images) # Hybrid approach: OpenCV + OCR (extracts text from frames)
python process_meeting.py samples/meeting.mkv --run-whisper --use-hybrid --scene-detection python process_meeting.py samples/meeting.mkv --run-whisper --use-hybrid --scene-detection
# Hybrid + LLM cleanup (best for code formatting) # Hybrid + LLM cleanup (best for code formatting)
@@ -183,12 +183,12 @@ Examples:
parser.add_argument( parser.add_argument(
'--embed-images', '--embed-images',
action='store_true', action='store_true',
help='Embed frame images (as base64) in enhanced transcript for LLM analysis' help='Skip OCR/vision analysis and reference frame files directly (faster, lets LLM analyze images)'
) )
parser.add_argument( parser.add_argument(
'--embed-quality', '--embed-quality',
type=int, type=int,
help='JPEG quality for embedded images (default: 80, lower = smaller file)', help='JPEG quality for extracted frames (default: 80, lower = smaller files)',
default=80 default=80
) )