def group_audio_by_intervals(self, segments: List[Dict], interval_seconds: int = 30) -> List[Dict]:
    """
    Group audio segments into regular time intervals.

    Instead of word-level timestamps, this creates intervals (e.g., every 30 seconds)
    with all text spoken during that interval concatenated together.

    Args:
        segments: List of audio segments with 'timestamp' and 'text' keys
        interval_seconds: Duration of each interval in seconds

    Returns:
        List of grouped segments ({'timestamp', 'text', 'type': 'audio'}),
        one per non-empty interval, in chronological order
    """
    if not segments:
        return []

    # Bucket each segment once by its interval index — a single O(n) pass
    # instead of rescanning the whole segment list for every interval.
    buckets: Dict[int, List[str]] = {}
    for seg in segments:
        index = int(seg['timestamp'] // interval_seconds)
        # A negative timestamp falls outside every [0, max] interval, so it
        # is dropped, matching the previous forward range scan from 0.
        if index >= 0:
            buckets.setdefault(index, []).append(seg['text'])

    # Emit only intervals that actually contain text, in time order.
    # Within an interval, texts keep the original segment order.
    intervals = [
        {
            'timestamp': index * interval_seconds,
            'text': ' '.join(texts),
            'type': 'audio'
        }
        for index, texts in sorted(buckets.items())
    ]

    logger.info(f"Grouped {len(segments)} segments into {len(intervals)} intervals of {interval_seconds}s")
    return intervals
Args: image_path: Path to image file context: Context hint for analysis (meeting, dashboard, code, console) + audio_context: Optional audio transcript around this timestamp for context Returns: Analyzed content description @@ -97,6 +98,10 @@ class VisionProcessor: # Load prompt from file prompt = self._load_prompt(context) + # Add audio context if available + if audio_context: + prompt = f"Audio context (what's being discussed around this time):\n{audio_context}\n\n{prompt}" + try: # Use Ollama's chat API with vision response = self._client.chat( @@ -123,7 +128,8 @@ class VisionProcessor: frames_info: List[Tuple[str, float]], context: str = "meeting", deduplicate: bool = True, - similarity_threshold: float = 0.85 + similarity_threshold: float = 0.85, + audio_segments: Optional[List[Dict]] = None ) -> List[Dict]: """ Process multiple frames with vision analysis. @@ -146,7 +152,10 @@ class VisionProcessor: for idx, (frame_path, timestamp) in enumerate(frames_info, 1): logger.info(f"Analyzing frame {idx}/{total} at {timestamp:.2f}s...") - text = self.analyze_frame(frame_path, context) + # Get audio context around this timestamp (±30 seconds) + audio_context = self._get_audio_context(timestamp, audio_segments, window=30) + + text = self.analyze_frame(frame_path, context, audio_context) if not text: logger.warning(f"No content extracted from frame at {timestamp:.2f}s") @@ -170,6 +179,29 @@ class VisionProcessor: logger.info(f"Extracted content from {len(results)} frames (deduplication: {deduplicate})") return results + def _get_audio_context(self, timestamp: float, audio_segments: Optional[List[Dict]], window: int = 30) -> str: + """ + Get audio transcript around a given timestamp. 
+ + Args: + timestamp: Target timestamp in seconds + audio_segments: List of audio segments with 'timestamp' and 'text' keys + window: Time window in seconds (±window around timestamp) + + Returns: + Concatenated audio text from the time window + """ + if not audio_segments: + return "" + + relevant = [seg for seg in audio_segments + if abs(seg.get('timestamp', 0) - timestamp) <= window] + + if not relevant: + return "" + + return " ".join([seg['text'] for seg in relevant]) + def _text_similarity(self, text1: str, text2: str) -> float: """ Calculate similarity between two texts. diff --git a/meetus/workflow.py b/meetus/workflow.py index acbae68..b7bb854 100644 --- a/meetus/workflow.py +++ b/meetus/workflow.py @@ -236,12 +236,25 @@ class ProcessingWorkflow: logger.info("Step 2: Running vision analysis on extracted frames...") logger.info(f"Loading vision model {self.config.vision_model} to GPU...") + # Load audio segments for context if transcript exists + audio_segments = [] + transcript_path = self.config.transcript_path or self._get_cached_transcript() + + if transcript_path: + transcript_file = Path(transcript_path) + if transcript_file.exists(): + logger.info("Loading audio transcript for context...") + merger = TranscriptMerger() + audio_segments = merger.load_whisper_transcript(str(transcript_file)) + logger.info(f"✓ Loaded {len(audio_segments)} audio segments for context") + try: vision = VisionProcessor(model=self.config.vision_model) screen_segments = vision.process_frames( frames_info, context=self.config.vision_context, - deduplicate=not self.config.no_deduplicate + deduplicate=not self.config.no_deduplicate, + audio_segments=audio_segments ) logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model") @@ -253,6 +266,11 @@ class ProcessingWorkflow: logger.error(f"{e}") raise + def _get_cached_transcript(self) -> Optional[str]: + """Get cached Whisper transcript if available.""" + cached = self.cache_mgr.get_whisper_cache() + return 
str(cached) if cached else None + def _run_ocr_analysis(self, frames_info): """Run OCR analysis on frames.""" logger.info("Step 2: Running OCR on extracted frames...") @@ -289,7 +307,8 @@ class ProcessingWorkflow: logger.warning(f"Transcript not found: {transcript_path}") logger.info("Proceeding with screen content only...") else: - audio_segments = merger.load_whisper_transcript(str(transcript_file)) + # Group audio into 30-second intervals for cleaner reference timestamps + audio_segments = merger.load_whisper_transcript(str(transcript_file), group_interval=30) logger.info(f"✓ Loaded {len(audio_segments)} audio segments") else: logger.info("No transcript provided, using screen content only...")