embed images

This commit is contained in:
Mariano Gabriel
2025-10-28 08:02:45 -03:00
parent b1e1daf278
commit 118ef04223
12 changed files with 1016 additions and 61 deletions

View File

@@ -32,23 +32,20 @@ def main():
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Run Whisper + vision analysis (recommended for code/dashboards)
python process_meeting.py samples/meeting.mkv --run-whisper --use-vision
# Embed images for LLM analysis (recommended - let LLM analyze actual frames)
python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection
# Use vision with specific context hint
python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --vision-context code
# Embed with custom quality (lower = smaller file size)
python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --embed-quality 60 --scene-detection
# Traditional OCR approach
python process_meeting.py samples/meeting.mkv --run-whisper
# Hybrid approach: OpenCV + OCR (extracts text, no images)
python process_meeting.py samples/meeting.mkv --run-whisper --use-hybrid --scene-detection
# Re-run analysis using cached frames and transcript
python process_meeting.py samples/meeting.mkv --use-vision
# Hybrid + LLM cleanup (best for code formatting)
python process_meeting.py samples/meeting.mkv --run-whisper --use-hybrid --hybrid-llm-cleanup --scene-detection
# Force reprocessing (ignore cache)
python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --no-cache
# Use scene detection for fewer frames
python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --scene-detection
# Iterate on scene threshold (reuse whisper transcript)
python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --scene-threshold 5 --skip-cache-frames --skip-cache-analysis
"""
)
@@ -119,6 +116,21 @@ Examples:
action='store_true',
help='Use local vision model (Ollama) instead of OCR for better context understanding'
)
# Hybrid (OpenCV text detection + OCR) options. The two switches below are
# store_true flags, so they are False unless given on the command line.
parser.add_argument('--use-hybrid', action='store_true',
                    help='Use hybrid approach: OpenCV text detection + OCR (more accurate than vision models)')
# NOTE(review): the help text states this needs --use-hybrid, but nothing in
# this call enforces it; confirm the dependency is validated after parsing.
parser.add_argument('--hybrid-llm-cleanup', action='store_true',
                    help='Use LLM to clean up OCR output and preserve code formatting (requires --use-hybrid)')
# Model name for the cleanup pass; the default is repeated in the help text,
# so update both together.
parser.add_argument('--hybrid-llm-model', default='llama3.2:3b',
                    help='LLM model for cleanup (default: llama3.2:3b)')
parser.add_argument(
'--vision-model',
help='Vision model to use with Ollama (default: llava:13b)',
@@ -168,6 +180,17 @@ Examples:
help='Output format style (default: detailed)',
default='detailed'
)
# Image-embedding options for the enhanced transcript.
parser.add_argument('--embed-images', action='store_true',
                    help='Embed frame images (as base64) in enhanced transcript for LLM analysis')
# Parsed as an int with a default of 80 (also stated in the help text).
# NOTE(review): no range check is applied here; confirm that out-of-range
# values are rejected or clamped where the JPEG is actually encoded.
parser.add_argument('--embed-quality', default=80, type=int,
                    help='JPEG quality for embedded images (default: 80, lower = smaller file)')
# Logging
parser.add_argument(