add whisperx support

2025-12-02 02:33:39 -03:00
parent 118ef04223
commit 7b919beda6
4 changed files with 155 additions and 38 deletions
--- a/def/05-reference-frames-instead-of-embedding.md
+++ b/def/05-reference-frames-instead-of-embedding.md
@@ -0,0 +1,124 @@
 # 05 - Reference Frame Files Instead of Embedding
 ## Date
 2025-10-28
 ## Context
 Embedding base64 images made the enhanced transcript files very large (3.7MB for ~40 frames). This made them harder to work with and slower to process.
 ## Problem
 - Enhanced transcript with embedded base64 images was 3.7MB
 - Large file size makes it slow to read/process
 - Difficult to inspect individual frames
 - Harder to share and to keep under version control
 ## Solution: Reference Frame Paths
 Instead of embedding base64 image data, reference the frame files by their relative paths.
 ### Before (Embedded):
 ```
 [00:08] SCREEN CONTENT:
  IMAGE (base64, 85KB):
  <image>data:image/jpeg;base64,/9j/4AAQSkZJRg...</image>
 ```
 File size: 3.7MB
 ### After (Referenced):
 ```
 [00:08] SCREEN CONTENT:
  Frame: frames/zaca-run-scrapers_00257.jpg
 ```
 File size: ~50KB
 ## Implementation
 **Directory Structure:**
 ```
 output/20251028-003-zaca-run-scrapers/
 ├── frames/
 │   ├── zaca-run-scrapers_00257.jpg
 │   ├── zaca-run-scrapers_00487.jpg
 │   └── ...
 ├── zaca-run-scrapers.json (whisper transcript)
 └── zaca-run-scrapers_enhanced.txt (references frames/ directory)
 ```
 **Enhanced Transcript Format:**
 ```
 ================================================================================
 ENHANCED MEETING TRANSCRIPT
 Audio transcript + Screen frames
 ================================================================================
 [00:30] SPEAKER:
  Bueno, te dio un tour para el proyecto...
 [00:08] SCREEN CONTENT:
  Frame: frames/zaca-run-scrapers_00257.jpg
 [01:00] SPEAKER:
  Mayormente en Scrapping lo que tenemos...
 [01:15] SCREEN CONTENT:
  Frame: frames/zaca-run-scrapers_00487.jpg
  TEXT:
  | Code snippet from screen (if OCR was used)
 ```
 ## Benefits
 ✓ **Much smaller files**: ~50KB vs 3.7MB (74x smaller!)
 ✓ **Easier to inspect**: Can view individual frames directly
 ✓ **LLM can access images**: Frame paths allow the LLM to load images on demand
 ✓ **Better version control**: Text files are small and diffable
 ✓ **Cleaner structure**: Frames organized in dedicated directory
 ✓ **Flexible**: Can still do OCR/vision analysis if needed (adds TEXT section)
 ## Flags
 **`--embed-images`**: Skip OCR/vision analysis, just reference frame files (the flag name is historical; images are no longer embedded as base64)
 - Faster (no analysis needed)
 - Lets LLM analyze raw images
 - Enhanced transcript only contains frame references
 **Without `--embed-images`**: Run OCR/vision analysis
 - Extracts text from frames
 - Enhanced transcript includes both frame reference AND extracted text
 - Useful for code/dashboard analysis
 ## Usage
 ```bash
 # Reference frames only (no OCR, faster)
 python process_meeting.py samples/video.mkv --run-whisper --embed-images --scene-detection -v
 # Reference frames + OCR text extraction
 python process_meeting.py samples/video.mkv --run-whisper --use-hybrid --scene-detection -v
 # Adjust JPEG quality of the extracted frames (default: 80, lower = smaller files)
 python process_meeting.py samples/video.mkv --run-whisper --embed-images --embed-quality 60 --scene-detection -v
 ```
 ## Files Modified
 - `meetus/transcript_merger.py` - Modified `_format_detailed()` to output frame paths instead of base64
 - `process_meeting.py` - Updated help text and examples to reflect frame referencing
 - All processors (OCR, vision, hybrid) already include `frame_path` in results (no changes needed)
 ## Workflow Example
 ```bash
 # First run: Generate everything
 python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection -v
 # Result:
 # - output/20251028-004-meeting/
 #   - frames/ (40 frames, ~80KB each)
 #   - meeting.json (whisper transcript)
 #   - meeting_enhanced.txt (~50KB, references frames/)
 # LLM can now:
 # 1. Read enhanced transcript
 # 2. See timeline of audio + screen changes
 # 3. Load individual frames as needed from frames/ directory
 ```
--- a/meetus/transcript_merger.py
+++ b/meetus/transcript_merger.py
@@ -45,14 +45,15 @@ class TranscriptMerger:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
-            # Handle different Whisper output formats
+            # Handle different Whisper/WhisperX output formats
            segments = []
            if isinstance(data, dict) and 'segments' in data:
-                # Standard Whisper JSON format
+                # Standard Whisper/WhisperX JSON format
                segments = [
                    {
                        'timestamp': seg.get('start', 0),
                        'text': seg['text'].strip(),
                        'speaker': seg.get('speaker'),  # WhisperX diarization
                        'type': 'audio'
                    }
                    for seg in data['segments']
@@ -63,6 +64,7 @@ class TranscriptMerger:
                    {
                        'timestamp': seg.get('start', seg.get('timestamp', 0)),
                        'text': seg['text'].strip(),
                        'speaker': seg.get('speaker'),  # WhisperX diarization
                        'type': 'audio'
                    }
                    for seg in data
@@ -207,35 +209,28 @@ class TranscriptMerger:
        lines = []
        lines.append("=" * 80)
        lines.append("ENHANCED MEETING TRANSCRIPT")
-        if self.embed_images:
+        lines.append("Audio transcript + Screen frames")
            lines.append("Audio transcript + Embedded frame images (base64)")
        else:
            lines.append("Audio transcript + Screen content")
        lines.append("=" * 80)
        lines.append("")
        total_image_bytes = 0
        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])
            if seg['type'] == 'audio':
-                lines.append(f"[{timestamp}] SPEAKER:")
+                speaker = seg.get('speaker', 'SPEAKER')
                lines.append(f"[{timestamp}] {speaker}:")
                lines.append(f"  {seg['text']}")
                lines.append("")
            else:  # screen
                lines.append(f"[{timestamp}] SCREEN CONTENT:")
-                # Embed image if requested
+                # Show frame path if available
-                if self.embed_images and 'frame_path' in seg:
+                if 'frame_path' in seg:
-                    b64_img, img_size = self._encode_image_base64(seg['frame_path'])
+                    # Get just the filename relative to the enhanced transcript
-                    total_image_bytes += img_size
+                    frame_path = Path(seg['frame_path'])
-
+                    relative_path = f"frames/{frame_path.name}"
-                    if b64_img:
+                    lines.append(f"  Frame: {relative_path}")
                        lines.append(f"  IMAGE (base64, {img_size // 1024}KB):")
                        lines.append(f"  <image>data:image/jpeg;base64,{b64_img}</image>")
                        lines.append("")
                # Include text content if available (fallback or additional context)
                if 'text' in seg and seg['text'].strip():
@@ -245,12 +240,6 @@ class TranscriptMerger:
                lines.append("")
        if self.embed_images and total_image_bytes > 0:
            total_mb = total_image_bytes / (1024 * 1024)
            lines.append("")
            lines.append(f"Total embedded images size: {total_mb:.2f} MB")
            logger.info(f"Embedded {len([s for s in segments if s['type'] == 'screen'])} images, total size: {total_mb:.2f} MB")
        return "\n".join(lines)
    def _format_compact(self, segments: List[Dict]) -> str:
@@ -259,7 +248,10 @@ class TranscriptMerger:
        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])
-            prefix = "SPEAKER" if seg['type'] == 'audio' else "SCREEN"
+            if seg['type'] == 'audio':
                prefix = seg.get('speaker', 'SPEAKER')
            else:
                prefix = "SCREEN"
            text = seg['text'].replace('\n', ' ')[:200]  # Truncate long screen text
            lines.append(f"[{timestamp}] {prefix}: {text}")
--- a/meetus/workflow.py
+++ b/meetus/workflow.py
@@ -184,10 +184,10 @@ class ProcessingWorkflow:
        logger.info("STEP 0: Running Whisper Transcription")
        logger.info("=" * 80)
-        # Check if whisper is installed
+        # Check if whisperx is installed
-        if not shutil.which("whisper"):
+        if not shutil.which("whisperx"):
-            logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
+            logger.error("WhisperX is not installed. Install it with: pip install whisperx")
-            raise RuntimeError("Whisper not installed")
+            raise RuntimeError("WhisperX not installed")
        # Unload Ollama model to free GPU memory for Whisper (if using vision)
        if self.config.use_vision:
@@ -199,16 +199,17 @@ class ProcessingWorkflow:
            except Exception as e:
                logger.warning(f"Could not unload Ollama model: {e}")
-        logger.info(f"Running Whisper transcription (model: {self.config.whisper_model})...")
+        logger.info(f"Running WhisperX transcription with diarization (model: {self.config.whisper_model})...")
        logger.info("This may take a few minutes depending on video length...")
-        # Run whisper command
+        # Run whisperx command with diarization
        cmd = [
-            "whisper",
+            "whisperx",
            str(self.config.video_path),
            "--model", self.config.whisper_model,
            "--output_format", "json",
-            "--output_dir", str(self.output_mgr.output_dir)
+            "--output_dir", str(self.output_mgr.output_dir),
            "--diarize",
        ]
        try:
--- a/process_meeting.py
+++ b/process_meeting.py
@@ -32,13 +32,13 @@ def main():
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
 Examples:
-  # Embed images for LLM analysis (recommended - let LLM analyze actual frames)
+  # Reference frames for LLM analysis (recommended - transcript includes frame paths)
  python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection
-  # Embed with custom quality (lower = smaller file size)
+  # Adjust frame extraction quality (lower = smaller files)
  python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --embed-quality 60 --scene-detection
-  # Hybrid approach: OpenCV + OCR (extracts text, no images)
+  # Hybrid approach: OpenCV + OCR (extracts text from frames)
  python process_meeting.py samples/meeting.mkv --run-whisper --use-hybrid --scene-detection
  # Hybrid + LLM cleanup (best for code formatting)
@@ -183,12 +183,12 @@ Examples:
    parser.add_argument(
        '--embed-images',
        action='store_true',
-        help='Embed frame images (as base64) in enhanced transcript for LLM analysis'
+        help='Skip OCR/vision analysis and reference frame files directly (faster, lets LLM analyze images)'
    )
    parser.add_argument(
        '--embed-quality',
        type=int,
-        help='JPEG quality for embedded images (default: 80, lower = smaller file)',
+        help='JPEG quality for extracted frames (default: 80, lower = smaller files)',
        default=80
    )