group text

This commit is contained in:
Mariano Gabriel
2025-10-23 14:49:14 -03:00
parent cdf7ad1199
commit c871af2def
3 changed files with 111 additions and 8 deletions

View File

@@ -83,13 +83,14 @@ class VisionProcessor:
logger.warning(f"Prompt file not found: {prompt_file}, using default")
return "Analyze this image and describe what you see in detail."
def analyze_frame(self, image_path: str, context: str = "meeting") -> str:
def analyze_frame(self, image_path: str, context: str = "meeting", audio_context: str = "") -> str:
"""
Analyze a single frame using local vision model.
Args:
image_path: Path to image file
context: Context hint for analysis (meeting, dashboard, code, console)
audio_context: Optional audio transcript around this timestamp for context
Returns:
Analyzed content description
@@ -97,6 +98,10 @@ class VisionProcessor:
# Load prompt from file
prompt = self._load_prompt(context)
# Add audio context if available
if audio_context:
prompt = f"Audio context (what's being discussed around this time):\n{audio_context}\n\n{prompt}"
try:
# Use Ollama's chat API with vision
response = self._client.chat(
@@ -123,7 +128,8 @@ class VisionProcessor:
frames_info: List[Tuple[str, float]],
context: str = "meeting",
deduplicate: bool = True,
similarity_threshold: float = 0.85
similarity_threshold: float = 0.85,
audio_segments: Optional[List[Dict]] = None
) -> List[Dict]:
"""
Process multiple frames with vision analysis.
@@ -146,7 +152,10 @@ class VisionProcessor:
for idx, (frame_path, timestamp) in enumerate(frames_info, 1):
logger.info(f"Analyzing frame {idx}/{total} at {timestamp:.2f}s...")
text = self.analyze_frame(frame_path, context)
# Get audio context around this timestamp (±30 seconds)
audio_context = self._get_audio_context(timestamp, audio_segments, window=30)
text = self.analyze_frame(frame_path, context, audio_context)
if not text:
logger.warning(f"No content extracted from frame at {timestamp:.2f}s")
@@ -170,6 +179,29 @@ class VisionProcessor:
logger.info(f"Extracted content from {len(results)} frames (deduplication: {deduplicate})")
return results
def _get_audio_context(self, timestamp: float, audio_segments: Optional[List[Dict]], window: int = 30) -> str:
"""
Get audio transcript around a given timestamp.
Args:
timestamp: Target timestamp in seconds
audio_segments: List of audio segments with 'timestamp' and 'text' keys
window: Time window in seconds (±window around timestamp)
Returns:
Concatenated audio text from the time window
"""
if not audio_segments:
return ""
relevant = [seg for seg in audio_segments
if abs(seg.get('timestamp', 0) - timestamp) <= window]
if not relevant:
return ""
return " ".join([seg['text'] for seg in relevant])
def _text_similarity(self, text1: str, text2: str) -> float:
"""
Calculate similarity between two texts.