Embed images: add --embed-images/--embed-quality and hybrid OCR (--use-hybrid) CLI options

2025-10-28 08:02:45 -03:00
parent b1e1daf278
commit 118ef04223
12 changed files with 1016 additions and 61 deletions
--- a/process_meeting.py
+++ b/process_meeting.py
@@ -32,23 +32,20 @@ def main():
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
 Examples:
-  # Run Whisper + vision analysis (recommended for code/dashboards)
-  python process_meeting.py samples/meeting.mkv --run-whisper --use-vision
+  # Embed images for LLM analysis (recommended - let LLM analyze actual frames)
+  python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection

-  # Use vision with specific context hint
-  python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --vision-context code
+  # Embed with custom quality (lower = smaller file size)
+  python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --embed-quality 60 --scene-detection

-  # Traditional OCR approach
-  python process_meeting.py samples/meeting.mkv --run-whisper
+  # Hybrid approach: OpenCV + OCR (extracts text, no images)
+  python process_meeting.py samples/meeting.mkv --run-whisper --use-hybrid --scene-detection

-  # Re-run analysis using cached frames and transcript
-  python process_meeting.py samples/meeting.mkv --use-vision
+  # Hybrid + LLM cleanup (best for code formatting)
+  python process_meeting.py samples/meeting.mkv --run-whisper --use-hybrid --hybrid-llm-cleanup --scene-detection

-  # Force reprocessing (ignore cache)
-  python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --no-cache
-
-  # Use scene detection for fewer frames
-  python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --scene-detection
+  # Iterate on scene threshold (reuse whisper transcript)
+  python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --scene-threshold 5 --skip-cache-frames --skip-cache-analysis
        """
    )

@@ -119,6 +116,21 @@ Examples:
        action='store_true',
        help='Use local vision model (Ollama) instead of OCR for better context understanding'
    )
+    parser.add_argument(
+        '--use-hybrid',
+        action='store_true',
+        help='Use hybrid approach: OpenCV text detection + OCR (more accurate than vision models)'
+    )
+    parser.add_argument(
+        '--hybrid-llm-cleanup',
+        action='store_true',
+        help='Use LLM to clean up OCR output and preserve code formatting (requires --use-hybrid)'
+    )
+    parser.add_argument(
+        '--hybrid-llm-model',
+        help='LLM model for cleanup (default: llama3.2:3b)',
+        default='llama3.2:3b'
+    )
    parser.add_argument(
        '--vision-model',
        help='Vision model to use with Ollama (default: llava:13b)',
@@ -168,6 +180,17 @@ Examples:
        help='Output format style (default: detailed)',
        default='detailed'
    )
+    parser.add_argument(
+        '--embed-images',
+        action='store_true',
+        help='Embed frame images (as base64) in enhanced transcript for LLM analysis'
+    )
+    parser.add_argument(
+        '--embed-quality',
+        type=int,
+        help='JPEG quality for embedded images (default: 80, lower = smaller file)',
+        default=80
+    )

    # Logging
    parser.add_argument(