group text

2025-10-23 14:49:14 -03:00
parent cdf7ad1199
commit c871af2def
3 changed files with 111 additions and 8 deletions
--- a/meetus/vision_processor.py
+++ b/meetus/vision_processor.py
@@ -83,13 +83,14 @@ class VisionProcessor:
            logger.warning(f"Prompt file not found: {prompt_file}, using default")
            return "Analyze this image and describe what you see in detail."

-    def analyze_frame(self, image_path: str, context: str = "meeting") -> str:
+    def analyze_frame(self, image_path: str, context: str = "meeting", audio_context: str = "") -> str:
        """
        Analyze a single frame using local vision model.

        Args:
            image_path: Path to image file
            context: Context hint for analysis (meeting, dashboard, code, console)
+            audio_context: Optional audio transcript around this timestamp for context

        Returns:
            Analyzed content description
@@ -97,6 +98,10 @@ class VisionProcessor:
        # Load prompt from file
        prompt = self._load_prompt(context)

+        # Add audio context if available
+        if audio_context:
+            prompt = f"Audio context (what's being discussed around this time):\n{audio_context}\n\n{prompt}"
+
        try:
            # Use Ollama's chat API with vision
            response = self._client.chat(
@@ -123,7 +128,8 @@ class VisionProcessor:
        frames_info: List[Tuple[str, float]],
        context: str = "meeting",
        deduplicate: bool = True,
-        similarity_threshold: float = 0.85
+        similarity_threshold: float = 0.85,
+        audio_segments: Optional[List[Dict]] = None
    ) -> List[Dict]:
        """
        Process multiple frames with vision analysis.
@@ -146,7 +152,10 @@ class VisionProcessor:
        for idx, (frame_path, timestamp) in enumerate(frames_info, 1):
            logger.info(f"Analyzing frame {idx}/{total} at {timestamp:.2f}s...")

-            text = self.analyze_frame(frame_path, context)
+            # Get audio context around this timestamp (±30 seconds)
+            audio_context = self._get_audio_context(timestamp, audio_segments, window=30)
+
+            text = self.analyze_frame(frame_path, context, audio_context)

            if not text:
                logger.warning(f"No content extracted from frame at {timestamp:.2f}s")
@@ -170,6 +179,29 @@ class VisionProcessor:
        logger.info(f"Extracted content from {len(results)} frames (deduplication: {deduplicate})")
        return results

+    def _get_audio_context(self, timestamp: float, audio_segments: Optional[List[Dict]], window: int = 30) -> str:
+        """
+        Get audio transcript around a given timestamp.
+
+        Args:
+            timestamp: Target timestamp in seconds
+            audio_segments: Segments with 'timestamp' and 'text' keys; a missing
+                'timestamp' counts as 0, missing or empty 'text' is skipped
+            window: Time window in seconds (±window around timestamp)
+
+        Returns:
+            In-window segment texts joined by spaces, or "" if there are none
+        """
+        if not audio_segments:
+            return ""
+
+        # Read 'text' with .get(), like 'timestamp', so a segment that lacks
+        # the key is skipped instead of raising KeyError.
+        texts = [seg.get('text') for seg in audio_segments
+                 if abs(seg.get('timestamp', 0) - timestamp) <= window]
+
+        return " ".join(text for text in texts if text)
+
    def _text_similarity(self, text1: str, text2: str) -> float:
        """
        Calculate similarity between two texts.