Add WhisperX support

This commit is contained in:
Mariano Gabriel
2025-12-03 06:48:45 -03:00
parent 7b919beda6
commit 7d7ec15ff7
4 changed files with 87 additions and 16 deletions

View File

@@ -70,8 +70,10 @@ class TranscriptMerger:
for seg in data
]
# Group by interval if requested
if group_interval and segments:
# Group by interval if requested, but skip if we have speaker diarization
# (merge_transcripts will group by speaker instead)
has_speakers = any(seg.get('speaker') for seg in segments)
if group_interval and segments and not has_speakers:
segments = self.group_audio_by_intervals(segments, group_interval)
return segments
@@ -164,13 +166,14 @@ class TranscriptMerger:
) -> List[Dict]:
"""
Merge audio and screen transcripts by timestamp.
Groups consecutive audio from same speaker until a screen frame interrupts.
Args:
audio_segments: List of audio transcript segments
screen_segments: List of screen OCR segments
Returns:
Merged list sorted by timestamp
Merged list sorted by timestamp, with audio grouped by speaker
"""
# Mark segment types
for seg in audio_segments:
@@ -182,7 +185,46 @@ class TranscriptMerger:
all_segments = audio_segments + screen_segments
all_segments.sort(key=lambda x: x['timestamp'])
return all_segments
# Group consecutive audio segments by speaker (screen frames break groups)
grouped = []
current_group = None
for seg in all_segments:
if seg['type'] == 'screen':
# Screen frame: flush current group and add frame
if current_group:
grouped.append(current_group)
current_group = None
grouped.append(seg)
else:
# Audio segment
speaker = seg.get('speaker')
if current_group is None:
# Start new group
current_group = {
'timestamp': seg['timestamp'],
'text': seg['text'],
'speaker': speaker,
'type': 'audio'
}
elif speaker == current_group.get('speaker'):
# Same speaker, append text
current_group['text'] += ' ' + seg['text']
else:
# Speaker changed, flush and start new group
grouped.append(current_group)
current_group = {
'timestamp': seg['timestamp'],
'text': seg['text'],
'speaker': speaker,
'type': 'audio'
}
# Don't forget last group
if current_group:
grouped.append(current_group)
return grouped
def format_for_claude(
self,