Add WhisperX support
This commit is contained in:
@@ -70,8 +70,10 @@ class TranscriptMerger:
|
||||
for seg in data
|
||||
]
|
||||
|
||||
# Group by interval if requested
|
||||
if group_interval and segments:
|
||||
# Group by interval if requested, but skip if we have speaker diarization
|
||||
# (merge_transcripts will group by speaker instead)
|
||||
has_speakers = any(seg.get('speaker') for seg in segments)
|
||||
if group_interval and segments and not has_speakers:
|
||||
segments = self.group_audio_by_intervals(segments, group_interval)
|
||||
|
||||
return segments
|
||||
@@ -164,13 +166,14 @@ class TranscriptMerger:
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
Merge audio and screen transcripts by timestamp.
|
||||
Groups consecutive audio from same speaker until a screen frame interrupts.
|
||||
|
||||
Args:
|
||||
audio_segments: List of audio transcript segments
|
||||
screen_segments: List of screen OCR segments
|
||||
|
||||
Returns:
|
||||
Merged list sorted by timestamp
|
||||
Merged list sorted by timestamp, with audio grouped by speaker
|
||||
"""
|
||||
# Mark segment types
|
||||
for seg in audio_segments:
|
||||
@@ -182,7 +185,46 @@ class TranscriptMerger:
|
||||
all_segments = audio_segments + screen_segments
|
||||
all_segments.sort(key=lambda x: x['timestamp'])
|
||||
|
||||
return all_segments
|
||||
# Group consecutive audio segments by speaker (screen frames break groups)
|
||||
grouped = []
|
||||
current_group = None
|
||||
|
||||
for seg in all_segments:
|
||||
if seg['type'] == 'screen':
|
||||
# Screen frame: flush current group and add frame
|
||||
if current_group:
|
||||
grouped.append(current_group)
|
||||
current_group = None
|
||||
grouped.append(seg)
|
||||
else:
|
||||
# Audio segment
|
||||
speaker = seg.get('speaker')
|
||||
if current_group is None:
|
||||
# Start new group
|
||||
current_group = {
|
||||
'timestamp': seg['timestamp'],
|
||||
'text': seg['text'],
|
||||
'speaker': speaker,
|
||||
'type': 'audio'
|
||||
}
|
||||
elif speaker == current_group.get('speaker'):
|
||||
# Same speaker, append text
|
||||
current_group['text'] += ' ' + seg['text']
|
||||
else:
|
||||
# Speaker changed, flush and start new group
|
||||
grouped.append(current_group)
|
||||
current_group = {
|
||||
'timestamp': seg['timestamp'],
|
||||
'text': seg['text'],
|
||||
'speaker': speaker,
|
||||
'type': 'audio'
|
||||
}
|
||||
|
||||
# Don't forget last group
|
||||
if current_group:
|
||||
grouped.append(current_group)
|
||||
|
||||
return grouped
|
||||
|
||||
def format_for_claude(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user