group text
This commit is contained in:
@@ -83,13 +83,14 @@ class VisionProcessor:
|
||||
logger.warning(f"Prompt file not found: {prompt_file}, using default")
|
||||
return "Analyze this image and describe what you see in detail."
|
||||
|
||||
def analyze_frame(self, image_path: str, context: str = "meeting") -> str:
|
||||
def analyze_frame(self, image_path: str, context: str = "meeting", audio_context: str = "") -> str:
|
||||
"""
|
||||
Analyze a single frame using local vision model.
|
||||
|
||||
Args:
|
||||
image_path: Path to image file
|
||||
context: Context hint for analysis (meeting, dashboard, code, console)
|
||||
audio_context: Optional audio transcript around this timestamp for context
|
||||
|
||||
Returns:
|
||||
Analyzed content description
|
||||
@@ -97,6 +98,10 @@ class VisionProcessor:
|
||||
# Load prompt from file
|
||||
prompt = self._load_prompt(context)
|
||||
|
||||
# Add audio context if available
|
||||
if audio_context:
|
||||
prompt = f"Audio context (what's being discussed around this time):\n{audio_context}\n\n{prompt}"
|
||||
|
||||
try:
|
||||
# Use Ollama's chat API with vision
|
||||
response = self._client.chat(
|
||||
@@ -123,7 +128,8 @@ class VisionProcessor:
|
||||
frames_info: List[Tuple[str, float]],
|
||||
context: str = "meeting",
|
||||
deduplicate: bool = True,
|
||||
similarity_threshold: float = 0.85
|
||||
similarity_threshold: float = 0.85,
|
||||
audio_segments: Optional[List[Dict]] = None
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
Process multiple frames with vision analysis.
|
||||
@@ -146,7 +152,10 @@ class VisionProcessor:
|
||||
for idx, (frame_path, timestamp) in enumerate(frames_info, 1):
|
||||
logger.info(f"Analyzing frame {idx}/{total} at {timestamp:.2f}s...")
|
||||
|
||||
text = self.analyze_frame(frame_path, context)
|
||||
# Get audio context around this timestamp (±30 seconds)
|
||||
audio_context = self._get_audio_context(timestamp, audio_segments, window=30)
|
||||
|
||||
text = self.analyze_frame(frame_path, context, audio_context)
|
||||
|
||||
if not text:
|
||||
logger.warning(f"No content extracted from frame at {timestamp:.2f}s")
|
||||
@@ -170,6 +179,29 @@ class VisionProcessor:
|
||||
logger.info(f"Extracted content from {len(results)} frames (deduplication: {deduplicate})")
|
||||
return results
|
||||
|
||||
def _get_audio_context(self, timestamp: float, audio_segments: Optional[List[Dict]], window: int = 30) -> str:
    """
    Get audio transcript around a given timestamp.

    Args:
        timestamp: Target timestamp in seconds
        audio_segments: List of audio segments with 'timestamp' and 'text' keys;
            None or an empty list yields an empty string
        window: Time window in seconds (±window around timestamp, bounds inclusive)

    Returns:
        Text of the segments inside the time window, joined with single
        spaces in input order, or "" when no segment contributes any text
    """
    if not audio_segments:
        return ""

    texts = []
    for seg in audio_segments:
        # A segment without a 'timestamp' key is treated as starting at 0s.
        if abs(seg.get('timestamp', 0) - timestamp) > window:
            continue

        # Read 'text' as tolerantly as 'timestamp': a segment with missing,
        # None or empty text is skipped rather than raising KeyError/TypeError
        # and aborting the whole frame analysis.
        text = seg.get('text')
        if text:
            texts.append(text)

    return " ".join(texts)
|
||||
|
||||
def _text_similarity(self, text1: str, text2: str) -> float:
|
||||
"""
|
||||
Calculate similarity between two texts.
|
||||
|
||||
Reference in New Issue
Block a user