From 7d7ec15ff786cfbead141ccf39cf9590d03da07c Mon Sep 17 00:00:00 2001
From: Mariano Gabriel <pensalo@gmail.com>
Date: Wed, 3 Dec 2025 06:48:45 -0300
Subject: [PATCH] make WhisperX diarization opt-in via --diarize, default to plain whisper, group merged audio by speaker

---
 meetus/cache_manager.py     |  2 +-
 meetus/transcript_merger.py | 50 ++++++++++++++++++++++++++++++++++---
 meetus/workflow.py          | 46 ++++++++++++++++++++++++++--------
 process_meeting.py          |  5 ++++
 4 files changed, 87 insertions(+), 16 deletions(-)

diff --git a/meetus/cache_manager.py b/meetus/cache_manager.py
index 0f5cfb6..b027398 100644
--- a/meetus/cache_manager.py
+++ b/meetus/cache_manager.py
@@ -73,7 +73,7 @@ class CacheManager:
         if not self.use_cache or self.skip_cache_frames or not self.frames_dir.exists():
             return None
 
-        existing_frames = list(self.frames_dir.glob("frame_*.jpg"))
+        existing_frames = list(self.frames_dir.glob("*.jpg"))
 
         if not existing_frames:
             return None
diff --git a/meetus/transcript_merger.py b/meetus/transcript_merger.py
index dda4ff3..a255044 100644
--- a/meetus/transcript_merger.py
+++ b/meetus/transcript_merger.py
@@ -70,8 +70,10 @@ class TranscriptMerger:
                     for seg in data
                 ]
 
-            # Group by interval if requested
-            if group_interval and segments:
+            # Group by interval if requested, but skip if we have speaker diarization
+            # (merge_transcripts will group by speaker instead)
+            has_speakers = any(seg.get('speaker') for seg in segments)
+            if group_interval and segments and not has_speakers:
                 segments = self.group_audio_by_intervals(segments, group_interval)
 
             return segments
@@ -164,13 +166,14 @@ class TranscriptMerger:
     ) -> List[Dict]:
         """
         Merge audio and screen transcripts by timestamp.
+        Groups consecutive audio from same speaker until a screen frame interrupts.
 
         Args:
             audio_segments: List of audio transcript segments
             screen_segments: List of screen OCR segments
 
         Returns:
-            Merged list sorted by timestamp
+            Merged list sorted by timestamp, with audio grouped by speaker
         """
         # Mark segment types
         for seg in audio_segments:
@@ -182,7 +185,46 @@ class TranscriptMerger:
         all_segments = audio_segments + screen_segments
         all_segments.sort(key=lambda x: x['timestamp'])
 
-        return all_segments
+        # Group consecutive audio segments by speaker (screen frames break groups)
+        grouped = []
+        current_group = None
+
+        for seg in all_segments:
+            if seg['type'] == 'screen':
+                # Screen frame: flush current group and add frame
+                if current_group:
+                    grouped.append(current_group)
+                    current_group = None
+                grouped.append(seg)
+            else:
+                # Audio segment
+                speaker = seg.get('speaker')
+                if current_group is None:
+                    # Start new group
+                    current_group = {
+                        'timestamp': seg['timestamp'],
+                        'text': seg['text'],
+                        'speaker': speaker,
+                        'type': 'audio'
+                    }
+                elif speaker == current_group.get('speaker'):
+                    # Same speaker, append text
+                    current_group['text'] += ' ' + seg['text']
+                else:
+                    # Speaker changed, flush and start new group
+                    grouped.append(current_group)
+                    current_group = {
+                        'timestamp': seg['timestamp'],
+                        'text': seg['text'],
+                        'speaker': speaker,
+                        'type': 'audio'
+                    }
+
+        # Don't forget last group
+        if current_group:
+            grouped.append(current_group)
+
+        return grouped
 
     def format_for_claude(
         self,
diff --git a/meetus/workflow.py b/meetus/workflow.py
index 3c2b0be..17e6038 100644
--- a/meetus/workflow.py
+++ b/meetus/workflow.py
@@ -4,6 +4,7 @@ Coordinates frame extraction, analysis, and transcript merging.
 """
 from pathlib import Path
 import logging
+import os
 import subprocess
 import shutil
 from typing import Dict, Any, Optional
@@ -32,6 +33,7 @@ class WorkflowConfig:
         # Whisper options
         self.run_whisper = kwargs.get('run_whisper', False)
         self.whisper_model = kwargs.get('whisper_model', 'medium')
+        self.diarize = kwargs.get('diarize', False)
 
         # Frame extraction
         self.scene_detection = kwargs.get('scene_detection', False)
@@ -176,18 +178,27 @@ class ProcessingWorkflow:
         if cached:
             return str(cached)
 
-        # If no cache and not running whisper, use provided transcript path (if any)
-        if not self.config.run_whisper:
+        # If no cache and not running whisper/diarize, use provided transcript path (if any)
+        if not self.config.run_whisper and not self.config.diarize:
             return self.config.transcript_path
 
         logger.info("=" * 80)
         logger.info("STEP 0: Running Whisper Transcription")
         logger.info("=" * 80)
 
-        # Check if whisperx is installed
-        if not shutil.which("whisperx"):
-            logger.error("WhisperX is not installed. Install it with: pip install whisperx")
-            raise RuntimeError("WhisperX not installed")
+        # Determine which transcription tool to use
+        use_diarize = getattr(self.config, 'diarize', False)
+
+        if use_diarize:
+            if not shutil.which("whisperx"):
+                logger.error("WhisperX is not installed. Install it with: pip install whisperx")
+                raise RuntimeError("WhisperX not installed (required for --diarize)")
+            transcribe_cmd = "whisperx"
+        else:
+            if not shutil.which("whisper"):
+                logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
+                raise RuntimeError("Whisper not installed")
+            transcribe_cmd = "whisper"
 
         # Unload Ollama model to free GPU memory for Whisper (if using vision)
         if self.config.use_vision:
@@ -199,21 +210,34 @@ class ProcessingWorkflow:
             except Exception as e:
                 logger.warning(f"Could not unload Ollama model: {e}")
 
-        logger.info(f"Running WhisperX transcription with diarization (model: {self.config.whisper_model})...")
+        if use_diarize:
+            logger.info(f"Running WhisperX transcription with diarization (model: {self.config.whisper_model})...")
+        else:
+            logger.info(f"Running Whisper transcription (model: {self.config.whisper_model})...")
         logger.info("This may take a few minutes depending on video length...")
 
-        # Run whisperx command with diarization
+        # Build command
         cmd = [
-            "whisperx",
+            transcribe_cmd,
             str(self.config.video_path),
             "--model", self.config.whisper_model,
             "--output_format", "json",
             "--output_dir", str(self.output_mgr.output_dir),
-            "--diarize",
         ]
+        if use_diarize:
+            cmd.append("--diarize")
 
         try:
-            subprocess.run(cmd, check=True, capture_output=True, text=True)
+            # Set up environment with cuDNN library path for whisperx
+            env = os.environ.copy()
+            if use_diarize:
+                import site
+                site_packages = site.getsitepackages()[0]
+                cudnn_path = Path(site_packages) / "nvidia" / "cudnn" / "lib"
+                if cudnn_path.exists():
+                    env["LD_LIBRARY_PATH"] = str(cudnn_path) + ":" + env.get("LD_LIBRARY_PATH", "")
+
+            subprocess.run(cmd, check=True, capture_output=True, text=True, env=env)
 
             transcript_path = self.output_mgr.get_path(f"{self.config.video_path.stem}.json")
 
diff --git a/process_meeting.py b/process_meeting.py
index 1e9d526..0dd0446 100644
--- a/process_meeting.py
+++ b/process_meeting.py
@@ -72,6 +72,11 @@ Examples:
         help='Whisper model to use (default: medium)',
         default='medium'
     )
+    parser.add_argument(
+        '--diarize',
+        action='store_true',
+        help='Use WhisperX with speaker diarization (requires whisperx and HuggingFace token)'
+    )
 
     # Output options
     parser.add_argument(