group text
This commit is contained in:
@@ -17,7 +17,7 @@ class TranscriptMerger:
|
|||||||
"""Initialize transcript merger."""
|
"""Initialize transcript merger."""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def load_whisper_transcript(self, transcript_path: str) -> List[Dict]:
|
def load_whisper_transcript(self, transcript_path: str, group_interval: Optional[int] = None) -> List[Dict]:
|
||||||
"""
|
"""
|
||||||
Load Whisper transcript from file.
|
Load Whisper transcript from file.
|
||||||
|
|
||||||
@@ -25,6 +25,7 @@ class TranscriptMerger:
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
transcript_path: Path to transcript file
|
transcript_path: Path to transcript file
|
||||||
|
group_interval: If specified, group audio segments into intervals (in seconds)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of dicts with 'timestamp' (optional) and 'text'
|
List of dicts with 'timestamp' (optional) and 'text'
|
||||||
@@ -36,9 +37,10 @@ class TranscriptMerger:
|
|||||||
data = json.load(f)
|
data = json.load(f)
|
||||||
|
|
||||||
# Handle different Whisper output formats
|
# Handle different Whisper output formats
|
||||||
|
segments = []
|
||||||
if isinstance(data, dict) and 'segments' in data:
|
if isinstance(data, dict) and 'segments' in data:
|
||||||
# Standard Whisper JSON format
|
# Standard Whisper JSON format
|
||||||
return [
|
segments = [
|
||||||
{
|
{
|
||||||
'timestamp': seg.get('start', 0),
|
'timestamp': seg.get('start', 0),
|
||||||
'text': seg['text'].strip(),
|
'text': seg['text'].strip(),
|
||||||
@@ -48,7 +50,7 @@ class TranscriptMerger:
|
|||||||
]
|
]
|
||||||
elif isinstance(data, list):
|
elif isinstance(data, list):
|
||||||
# List of segments
|
# List of segments
|
||||||
return [
|
segments = [
|
||||||
{
|
{
|
||||||
'timestamp': seg.get('start', seg.get('timestamp', 0)),
|
'timestamp': seg.get('start', seg.get('timestamp', 0)),
|
||||||
'text': seg['text'].strip(),
|
'text': seg['text'].strip(),
|
||||||
@@ -57,6 +59,12 @@ class TranscriptMerger:
|
|||||||
for seg in data
|
for seg in data
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Group by interval if requested
|
||||||
|
if group_interval and segments:
|
||||||
|
segments = self.group_audio_by_intervals(segments, group_interval)
|
||||||
|
|
||||||
|
return segments
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Plain text file - no timestamps
|
# Plain text file - no timestamps
|
||||||
with open(path, 'r', encoding='utf-8') as f:
|
with open(path, 'r', encoding='utf-8') as f:
|
||||||
@@ -68,6 +76,50 @@ class TranscriptMerger:
|
|||||||
'type': 'audio'
|
'type': 'audio'
|
||||||
}]
|
}]
|
||||||
|
|
||||||
|
def group_audio_by_intervals(self, segments: List[Dict], interval_seconds: int = 30) -> List[Dict]:
    """
    Group audio segments into regular time intervals.

    Instead of fine-grained timestamps, this creates intervals (e.g., every
    30 seconds) with all text spoken during that interval concatenated
    together, in the order the segments were given.

    Args:
        segments: List of audio segments, each a dict with 'timestamp'
            (seconds) and 'text'. A missing or None timestamp is treated
            as 0 so the text is kept in the first interval.
        interval_seconds: Duration of each interval in seconds (must be > 0)

    Returns:
        List of grouped segments, ordered by time. Each entry has
        'timestamp' (start of the interval), 'text' (space-joined segment
        texts) and 'type' ('audio'). Intervals without any segment are
        omitted.

    Raises:
        ValueError: If interval_seconds is not positive.
    """
    if not segments:
        return []

    # Fail loudly instead of dividing by zero (0) or silently returning
    # nothing (negative values).
    if interval_seconds <= 0:
        raise ValueError(f"interval_seconds must be positive, got {interval_seconds}")

    # Single pass: bucket index -> texts, rather than rescanning every
    # segment once per interval.
    buckets = {}
    for seg in segments:
        seg_time = seg.get('timestamp')
        if seg_time is None:
            seg_time = 0
        # Clamp so a (malformed) negative timestamp lands in the first
        # interval instead of being dropped.
        bucket_index = max(0, int(seg_time // interval_seconds))
        buckets.setdefault(bucket_index, []).append(seg['text'])

    intervals = [
        {
            'timestamp': bucket_index * interval_seconds,
            'text': ' '.join(buckets[bucket_index]),
            'type': 'audio',
        }
        for bucket_index in sorted(buckets)
    ]

    logger.info(
        "Grouped %d segments into %d intervals of %ss",
        len(segments), len(intervals), interval_seconds,
    )
    return intervals
|
||||||
|
|
||||||
def merge_transcripts(
|
def merge_transcripts(
|
||||||
self,
|
self,
|
||||||
audio_segments: List[Dict],
|
audio_segments: List[Dict],
|
||||||
|
|||||||
@@ -83,13 +83,14 @@ class VisionProcessor:
|
|||||||
logger.warning(f"Prompt file not found: {prompt_file}, using default")
|
logger.warning(f"Prompt file not found: {prompt_file}, using default")
|
||||||
return "Analyze this image and describe what you see in detail."
|
return "Analyze this image and describe what you see in detail."
|
||||||
|
|
||||||
def analyze_frame(self, image_path: str, context: str = "meeting") -> str:
|
def analyze_frame(self, image_path: str, context: str = "meeting", audio_context: str = "") -> str:
|
||||||
"""
|
"""
|
||||||
Analyze a single frame using local vision model.
|
Analyze a single frame using local vision model.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image_path: Path to image file
|
image_path: Path to image file
|
||||||
context: Context hint for analysis (meeting, dashboard, code, console)
|
context: Context hint for analysis (meeting, dashboard, code, console)
|
||||||
|
audio_context: Optional audio transcript around this timestamp for context
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Analyzed content description
|
Analyzed content description
|
||||||
@@ -97,6 +98,10 @@ class VisionProcessor:
|
|||||||
# Load prompt from file
|
# Load prompt from file
|
||||||
prompt = self._load_prompt(context)
|
prompt = self._load_prompt(context)
|
||||||
|
|
||||||
|
# Add audio context if available
|
||||||
|
if audio_context:
|
||||||
|
prompt = f"Audio context (what's being discussed around this time):\n{audio_context}\n\n{prompt}"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Use Ollama's chat API with vision
|
# Use Ollama's chat API with vision
|
||||||
response = self._client.chat(
|
response = self._client.chat(
|
||||||
@@ -123,7 +128,8 @@ class VisionProcessor:
|
|||||||
frames_info: List[Tuple[str, float]],
|
frames_info: List[Tuple[str, float]],
|
||||||
context: str = "meeting",
|
context: str = "meeting",
|
||||||
deduplicate: bool = True,
|
deduplicate: bool = True,
|
||||||
similarity_threshold: float = 0.85
|
similarity_threshold: float = 0.85,
|
||||||
|
audio_segments: Optional[List[Dict]] = None
|
||||||
) -> List[Dict]:
|
) -> List[Dict]:
|
||||||
"""
|
"""
|
||||||
Process multiple frames with vision analysis.
|
Process multiple frames with vision analysis.
|
||||||
@@ -146,7 +152,10 @@ class VisionProcessor:
|
|||||||
for idx, (frame_path, timestamp) in enumerate(frames_info, 1):
|
for idx, (frame_path, timestamp) in enumerate(frames_info, 1):
|
||||||
logger.info(f"Analyzing frame {idx}/{total} at {timestamp:.2f}s...")
|
logger.info(f"Analyzing frame {idx}/{total} at {timestamp:.2f}s...")
|
||||||
|
|
||||||
text = self.analyze_frame(frame_path, context)
|
# Get audio context around this timestamp (±30 seconds)
|
||||||
|
audio_context = self._get_audio_context(timestamp, audio_segments, window=30)
|
||||||
|
|
||||||
|
text = self.analyze_frame(frame_path, context, audio_context)
|
||||||
|
|
||||||
if not text:
|
if not text:
|
||||||
logger.warning(f"No content extracted from frame at {timestamp:.2f}s")
|
logger.warning(f"No content extracted from frame at {timestamp:.2f}s")
|
||||||
@@ -170,6 +179,29 @@ class VisionProcessor:
|
|||||||
logger.info(f"Extracted content from {len(results)} frames (deduplication: {deduplicate})")
|
logger.info(f"Extracted content from {len(results)} frames (deduplication: {deduplicate})")
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
def _get_audio_context(self, timestamp: float, audio_segments: Optional[List[Dict]], window: int = 30) -> str:
    """
    Get audio transcript around a given timestamp.

    Args:
        timestamp: Target timestamp in seconds
        audio_segments: List of audio segments with 'timestamp' and 'text'
            keys, or None/empty when no transcript is available
        window: Time window in seconds (±window around timestamp, inclusive)

    Returns:
        Space-joined text of every segment whose timestamp lies within the
        window, in input order; "" when there are no segments or none match.
    """
    if not audio_segments:
        return ""

    nearby_texts = []
    for seg in audio_segments:
        # A missing key keeps the original default of 0.
        seg_time = seg.get('timestamp', 0)
        # An explicit None marks an untimed segment: it cannot be placed
        # relative to the frame, so skip it instead of raising TypeError.
        if seg_time is None:
            continue
        if abs(seg_time - timestamp) > window:
            continue
        # Ignore missing/empty text so the result has no stray double spaces.
        text = seg.get('text')
        if text:
            nearby_texts.append(text)

    return " ".join(nearby_texts)
|
||||||
|
|
||||||
def _text_similarity(self, text1: str, text2: str) -> float:
|
def _text_similarity(self, text1: str, text2: str) -> float:
|
||||||
"""
|
"""
|
||||||
Calculate similarity between two texts.
|
Calculate similarity between two texts.
|
||||||
|
|||||||
@@ -236,12 +236,25 @@ class ProcessingWorkflow:
|
|||||||
logger.info("Step 2: Running vision analysis on extracted frames...")
|
logger.info("Step 2: Running vision analysis on extracted frames...")
|
||||||
logger.info(f"Loading vision model {self.config.vision_model} to GPU...")
|
logger.info(f"Loading vision model {self.config.vision_model} to GPU...")
|
||||||
|
|
||||||
|
# Load audio segments for context if transcript exists
|
||||||
|
audio_segments = []
|
||||||
|
transcript_path = self.config.transcript_path or self._get_cached_transcript()
|
||||||
|
|
||||||
|
if transcript_path:
|
||||||
|
transcript_file = Path(transcript_path)
|
||||||
|
if transcript_file.exists():
|
||||||
|
logger.info("Loading audio transcript for context...")
|
||||||
|
merger = TranscriptMerger()
|
||||||
|
audio_segments = merger.load_whisper_transcript(str(transcript_file))
|
||||||
|
logger.info(f"✓ Loaded {len(audio_segments)} audio segments for context")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
vision = VisionProcessor(model=self.config.vision_model)
|
vision = VisionProcessor(model=self.config.vision_model)
|
||||||
screen_segments = vision.process_frames(
|
screen_segments = vision.process_frames(
|
||||||
frames_info,
|
frames_info,
|
||||||
context=self.config.vision_context,
|
context=self.config.vision_context,
|
||||||
deduplicate=not self.config.no_deduplicate
|
deduplicate=not self.config.no_deduplicate,
|
||||||
|
audio_segments=audio_segments
|
||||||
)
|
)
|
||||||
logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model")
|
logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model")
|
||||||
|
|
||||||
@@ -253,6 +266,11 @@ class ProcessingWorkflow:
|
|||||||
logger.error(f"{e}")
|
logger.error(f"{e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
def _get_cached_transcript(self) -> Optional[str]:
    """
    Look up a cached Whisper transcript via the cache manager.

    Returns:
        The value returned by ``self.cache_mgr.get_whisper_cache()``
        converted to ``str``, or None when that value is falsy.
    """
    cache_entry = self.cache_mgr.get_whisper_cache()
    if not cache_entry:
        return None
    return str(cache_entry)
|
||||||
|
|
||||||
def _run_ocr_analysis(self, frames_info):
|
def _run_ocr_analysis(self, frames_info):
|
||||||
"""Run OCR analysis on frames."""
|
"""Run OCR analysis on frames."""
|
||||||
logger.info("Step 2: Running OCR on extracted frames...")
|
logger.info("Step 2: Running OCR on extracted frames...")
|
||||||
@@ -289,7 +307,8 @@ class ProcessingWorkflow:
|
|||||||
logger.warning(f"Transcript not found: {transcript_path}")
|
logger.warning(f"Transcript not found: {transcript_path}")
|
||||||
logger.info("Proceeding with screen content only...")
|
logger.info("Proceeding with screen content only...")
|
||||||
else:
|
else:
|
||||||
audio_segments = merger.load_whisper_transcript(str(transcript_file))
|
# Group audio into 30-second intervals for cleaner reference timestamps
|
||||||
|
audio_segments = merger.load_whisper_transcript(str(transcript_file), group_interval=30)
|
||||||
logger.info(f"✓ Loaded {len(audio_segments)} audio segments")
|
logger.info(f"✓ Loaded {len(audio_segments)} audio segments")
|
||||||
else:
|
else:
|
||||||
logger.info("No transcript provided, using screen content only...")
|
logger.info("No transcript provided, using screen content only...")
|
||||||
|
|||||||
Reference in New Issue
Block a user