def group_audio_by_intervals(self, segments: List[Dict], interval_seconds: int = 30) -> List[Dict]:
    """
    Group audio segments into regular time intervals.

    Instead of word-level timestamps, this creates intervals (e.g., every 30 seconds)
    with all text spoken during that interval concatenated together.

    Args:
        segments: List of audio segments with 'timestamp' and 'text' keys
        interval_seconds: Duration of each interval in seconds

    Returns:
        List of grouped segments ({'timestamp', 'text', 'type': 'audio'}),
        one per non-empty interval, in chronological order
    """
    if not segments:
        return []

    # Bucket each segment once by its interval index — a single O(n) pass
    # instead of rescanning the whole segment list for every interval.
    buckets: Dict[int, List[str]] = {}
    for seg in segments:
        index = int(seg['timestamp'] // interval_seconds)
        # A negative timestamp falls outside every [0, max] interval, so it
        # is dropped, matching the previous forward range scan from 0.
        if index >= 0:
            buckets.setdefault(index, []).append(seg['text'])

    # Emit only intervals that actually contain text, in time order.
    # Within an interval, texts keep the original segment order.
    intervals = [
        {
            'timestamp': index * interval_seconds,
            'text': ' '.join(texts),
            'type': 'audio'
        }
        for index, texts in sorted(buckets.items())
    ]

    logger.info(f"Grouped {len(segments)} segments into {len(intervals)} intervals of {interval_seconds}s")
    return intervals
Args: image_path: Path to image file context: Context hint for analysis (meeting, dashboard, code, console) + audio_context: Optional audio transcript around this timestamp for context Returns: Analyzed content description @@ -97,6 +98,10 @@ class VisionProcessor: # Load prompt from file prompt = self._load_prompt(context) + # Add audio context if available + if audio_context: + prompt = f"Audio context (what's being discussed around this time):\n{audio_context}\n\n{prompt}" + try: # Use Ollama's chat API with vision response = self._client.chat( @@ -123,7 +128,8 @@ class VisionProcessor: frames_info: List[Tuple[str, float]], context: str = "meeting", deduplicate: bool = True, - similarity_threshold: float = 0.85 + similarity_threshold: float = 0.85, + audio_segments: Optional[List[Dict]] = None ) -> List[Dict]: """ Process multiple frames with vision analysis. @@ -146,7 +152,10 @@ class VisionProcessor: for idx, (frame_path, timestamp) in enumerate(frames_info, 1): logger.info(f"Analyzing frame {idx}/{total} at {timestamp:.2f}s...") - text = self.analyze_frame(frame_path, context) + # Get audio context around this timestamp (±30 seconds) + audio_context = self._get_audio_context(timestamp, audio_segments, window=30) + + text = self.analyze_frame(frame_path, context, audio_context) if not text: logger.warning(f"No content extracted from frame at {timestamp:.2f}s") @@ -170,6 +179,29 @@ class VisionProcessor: logger.info(f"Extracted content from {len(results)} frames (deduplication: {deduplicate})") return results + def _get_audio_context(self, timestamp: float, audio_segments: Optional[List[Dict]], window: int = 30) -> str: + """ + Get audio transcript around a given timestamp. 
+ + Args: + timestamp: Target timestamp in seconds + audio_segments: List of audio segments with 'timestamp' and 'text' keys + window: Time window in seconds (±window around timestamp) + + Returns: + Concatenated audio text from the time window + """ + if not audio_segments: + return "" + + relevant = [seg for seg in audio_segments + if abs(seg.get('timestamp', 0) - timestamp) <= window] + + if not relevant: + return "" + + return " ".join([seg['text'] for seg in relevant]) + def _text_similarity(self, text1: str, text2: str) -> float: """ Calculate similarity between two texts. diff --git a/meetus/workflow.py b/meetus/workflow.py index acbae68..b7bb854 100644 --- a/meetus/workflow.py +++ b/meetus/workflow.py @@ -236,12 +236,25 @@ class ProcessingWorkflow: logger.info("Step 2: Running vision analysis on extracted frames...") logger.info(f"Loading vision model {self.config.vision_model} to GPU...") + # Load audio segments for context if transcript exists + audio_segments = [] + transcript_path = self.config.transcript_path or self._get_cached_transcript() + + if transcript_path: + transcript_file = Path(transcript_path) + if transcript_file.exists(): + logger.info("Loading audio transcript for context...") + merger = TranscriptMerger() + audio_segments = merger.load_whisper_transcript(str(transcript_file)) + logger.info(f"✓ Loaded {len(audio_segments)} audio segments for context") + try: vision = VisionProcessor(model=self.config.vision_model) screen_segments = vision.process_frames( frames_info, context=self.config.vision_context, - deduplicate=not self.config.no_deduplicate + deduplicate=not self.config.no_deduplicate, + audio_segments=audio_segments ) logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model") @@ -253,6 +266,11 @@ class ProcessingWorkflow: logger.error(f"{e}") raise + def _get_cached_transcript(self) -> Optional[str]: + """Get cached Whisper transcript if available.""" + cached = self.cache_mgr.get_whisper_cache() + return 
str(cached) if cached else None + def _run_ocr_analysis(self, frames_info): """Run OCR analysis on frames.""" logger.info("Step 2: Running OCR on extracted frames...") @@ -289,7 +307,8 @@ class ProcessingWorkflow: logger.warning(f"Transcript not found: {transcript_path}") logger.info("Proceeding with screen content only...") else: - audio_segments = merger.load_whisper_transcript(str(transcript_file)) + # Group audio into 30-second intervals for cleaner reference timestamps + audio_segments = merger.load_whisper_transcript(str(transcript_file), group_interval=30) logger.info(f"✓ Loaded {len(audio_segments)} audio segments") else: logger.info("No transcript provided, using screen content only...")