Group transcript text into fixed time intervals

This commit is contained in:
Mariano Gabriel
2025-10-23 14:49:14 -03:00
parent cdf7ad1199
commit c871af2def
3 changed files with 111 additions and 8 deletions

View File

@@ -17,7 +17,7 @@ class TranscriptMerger:
"""Initialize transcript merger."""
pass
def load_whisper_transcript(self, transcript_path: str) -> List[Dict]:
def load_whisper_transcript(self, transcript_path: str, group_interval: Optional[int] = None) -> List[Dict]:
"""
Load Whisper transcript from file.
@@ -25,6 +25,7 @@ class TranscriptMerger:
Args:
transcript_path: Path to transcript file
group_interval: If specified, group audio segments into intervals (in seconds)
Returns:
List of dicts with 'timestamp' (optional) and 'text'
@@ -36,9 +37,10 @@ class TranscriptMerger:
data = json.load(f)
# Handle different Whisper output formats
segments = []
if isinstance(data, dict) and 'segments' in data:
# Standard Whisper JSON format
return [
segments = [
{
'timestamp': seg.get('start', 0),
'text': seg['text'].strip(),
@@ -48,7 +50,7 @@ class TranscriptMerger:
]
elif isinstance(data, list):
# List of segments
return [
segments = [
{
'timestamp': seg.get('start', seg.get('timestamp', 0)),
'text': seg['text'].strip(),
@@ -57,6 +59,12 @@ class TranscriptMerger:
for seg in data
]
# Group by interval if requested
if group_interval and segments:
segments = self.group_audio_by_intervals(segments, group_interval)
return segments
else:
# Plain text file - no timestamps
with open(path, 'r', encoding='utf-8') as f:
@@ -68,6 +76,50 @@ class TranscriptMerger:
'type': 'audio'
}]
def group_audio_by_intervals(self, segments: List[Dict], interval_seconds: int = 30) -> List[Dict]:
    """
    Group audio segments into regular time intervals.

    Each segment is assigned to the interval that contains its own
    'timestamp' (e.g. with 30-second intervals: [0, 30), [30, 60), ...),
    and the text of all segments in the same interval is joined with
    single spaces, in input order.

    Args:
        segments: List of audio segments; each must have a numeric
            'timestamp' (seconds) and a 'text' string
        interval_seconds: Duration of each interval in seconds (must be > 0)

    Returns:
        List of grouped segments ordered by interval start. Each entry has
        'timestamp' (the interval start), 'text' (the joined text) and
        'type' set to 'audio'. Intervals containing no segments are omitted.

    Raises:
        ValueError: If segments is non-empty and interval_seconds is not
            positive.
    """
    if not segments:
        return []

    # A zero interval would divide by zero and a negative one would yield
    # a meaningless grouping, so reject both with an explicit message.
    if interval_seconds <= 0:
        raise ValueError(
            f"interval_seconds must be positive, got {interval_seconds}"
        )

    # Single pass: bucket each segment by the index of its interval instead
    # of rescanning every segment once per interval. Floor division also
    # keeps segments with negative timestamps (in negative-start intervals)
    # rather than silently dropping their text.
    buckets: Dict[int, List[str]] = {}
    for seg in segments:
        index = int(seg['timestamp'] // interval_seconds)
        buckets.setdefault(index, []).append(seg['text'])

    # Bucket indices are unique, so sorting the items orders intervals
    # chronologically without ever comparing the text lists.
    intervals = [
        {
            'timestamp': index * interval_seconds,
            'text': ' '.join(texts),
            'type': 'audio'
        }
        for index, texts in sorted(buckets.items())
    ]

    logger.info(f"Grouped {len(segments)} segments into {len(intervals)} intervals of {interval_seconds}s")
    return intervals
def merge_transcripts(
self,
audio_segments: List[Dict],