add vision processor
This commit is contained in:
@@ -13,6 +13,7 @@ import shutil
|
||||
|
||||
from meetus.frame_extractor import FrameExtractor
|
||||
from meetus.ocr_processor import OCRProcessor
|
||||
from meetus.vision_processor import VisionProcessor
|
||||
from meetus.transcript_merger import TranscriptMerger
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -98,20 +99,23 @@ def main():
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
# Run Whisper + full processing in one command
|
||||
# Run Whisper + vision analysis (recommended for code/dashboards)
|
||||
python process_meeting.py samples/meeting.mkv --run-whisper --use-vision
|
||||
|
||||
# Use vision with specific context hint
|
||||
python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --vision-context code
|
||||
|
||||
# Traditional OCR approach
|
||||
python process_meeting.py samples/meeting.mkv --run-whisper
|
||||
|
||||
# Process video with existing Whisper transcript
|
||||
python process_meeting.py samples/meeting.mkv --transcript output/meeting.json
|
||||
# Re-run analysis using cached frames and transcript
|
||||
python process_meeting.py samples/meeting.mkv --use-vision
|
||||
|
||||
# Use scene detection instead of interval
|
||||
python process_meeting.py samples/meeting.mkv --run-whisper --scene-detection
|
||||
# Force reprocessing (ignore cache)
|
||||
python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --no-cache
|
||||
|
||||
# Use different Whisper model and OCR engine
|
||||
python process_meeting.py samples/meeting.mkv --run-whisper --whisper-model small --ocr-engine easyocr
|
||||
|
||||
# Extract frames only (no transcript)
|
||||
python process_meeting.py samples/meeting.mkv --extract-only
|
||||
# Use scene detection for fewer frames
|
||||
python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --scene-detection
|
||||
"""
|
||||
)
|
||||
|
||||
@@ -177,6 +181,31 @@ Examples:
|
||||
default='tesseract'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--use-vision',
|
||||
action='store_true',
|
||||
help='Use local vision model (Ollama) instead of OCR for better context understanding'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--vision-model',
|
||||
help='Vision model to use with Ollama (default: llava:13b)',
|
||||
default='llava:13b'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--vision-context',
|
||||
choices=['meeting', 'dashboard', 'code', 'console'],
|
||||
help='Context hint for vision analysis (default: meeting)',
|
||||
default='meeting'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--no-cache',
|
||||
action='store_true',
|
||||
help='Disable caching - reprocess everything even if outputs exist'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--no-deduplicate',
|
||||
action='store_true',
|
||||
@@ -221,61 +250,113 @@ Examples:
|
||||
if args.output is None:
|
||||
args.output = str(output_dir / f"{video_path.stem}_enhanced.txt")
|
||||
|
||||
# Run Whisper if requested
|
||||
# Define cache paths
|
||||
whisper_cache = output_dir / f"{video_path.stem}.json"
|
||||
analysis_cache = output_dir / f"{video_path.stem}_{'vision' if args.use_vision else 'ocr'}.json"
|
||||
frames_cache_dir = Path(args.frames_dir)
|
||||
|
||||
# Check for cached Whisper transcript
|
||||
if args.run_whisper:
|
||||
logger.info("=" * 80)
|
||||
logger.info("STEP 0: Running Whisper Transcription")
|
||||
logger.info("=" * 80)
|
||||
transcript_path = run_whisper(video_path, args.whisper_model, str(output_dir))
|
||||
args.transcript = str(transcript_path)
|
||||
logger.info("")
|
||||
if not args.no_cache and whisper_cache.exists():
|
||||
logger.info(f"✓ Found cached Whisper transcript: {whisper_cache}")
|
||||
args.transcript = str(whisper_cache)
|
||||
else:
|
||||
logger.info("=" * 80)
|
||||
logger.info("STEP 0: Running Whisper Transcription")
|
||||
logger.info("=" * 80)
|
||||
transcript_path = run_whisper(video_path, args.whisper_model, str(output_dir))
|
||||
args.transcript = str(transcript_path)
|
||||
logger.info("")
|
||||
|
||||
logger.info("=" * 80)
|
||||
logger.info("MEETING PROCESSOR")
|
||||
logger.info("=" * 80)
|
||||
logger.info(f"Video: {video_path.name}")
|
||||
logger.info(f"OCR Engine: {args.ocr_engine}")
|
||||
logger.info(f"Analysis: {'Vision Model' if args.use_vision else f'OCR ({args.ocr_engine})'}")
|
||||
if args.use_vision:
|
||||
logger.info(f"Vision Model: {args.vision_model}")
|
||||
logger.info(f"Context: {args.vision_context}")
|
||||
logger.info(f"Frame extraction: {'Scene detection' if args.scene_detection else f'Every {args.interval}s'}")
|
||||
if args.transcript:
|
||||
logger.info(f"Transcript: {args.transcript}")
|
||||
logger.info(f"Caching: {'Disabled' if args.no_cache else 'Enabled'}")
|
||||
logger.info("=" * 80)
|
||||
|
||||
# Step 1: Extract frames
|
||||
# Step 1: Extract frames (with caching)
|
||||
logger.info("Step 1: Extracting frames from video...")
|
||||
extractor = FrameExtractor(str(video_path), args.frames_dir)
|
||||
|
||||
if args.scene_detection:
|
||||
frames_info = extractor.extract_scene_changes()
|
||||
# Check if frames already exist
|
||||
existing_frames = list(frames_cache_dir.glob(f"{video_path.stem}_*.jpg")) if frames_cache_dir.exists() else []
|
||||
|
||||
if not args.no_cache and existing_frames and len(existing_frames) > 0:
|
||||
logger.info(f"✓ Found {len(existing_frames)} cached frames in {args.frames_dir}/")
|
||||
# Build frames_info from existing files
|
||||
frames_info = []
|
||||
for frame_path in sorted(existing_frames):
|
||||
# Try to extract timestamp from filename (e.g., video_00001_12.34s.jpg)
|
||||
try:
|
||||
timestamp_str = frame_path.stem.split('_')[-1].rstrip('s')
|
||||
timestamp = float(timestamp_str)
|
||||
except:
|
||||
timestamp = 0.0
|
||||
frames_info.append((str(frame_path), timestamp))
|
||||
else:
|
||||
frames_info = extractor.extract_by_interval(args.interval)
|
||||
extractor = FrameExtractor(str(video_path), args.frames_dir)
|
||||
|
||||
if not frames_info:
|
||||
logger.error("No frames extracted")
|
||||
sys.exit(1)
|
||||
if args.scene_detection:
|
||||
frames_info = extractor.extract_scene_changes()
|
||||
else:
|
||||
frames_info = extractor.extract_by_interval(args.interval)
|
||||
|
||||
logger.info(f"✓ Extracted {len(frames_info)} frames")
|
||||
if not frames_info:
|
||||
logger.error("No frames extracted")
|
||||
sys.exit(1)
|
||||
|
||||
# Step 2: Run OCR on frames
|
||||
logger.info("Step 2: Running OCR on extracted frames...")
|
||||
try:
|
||||
ocr = OCRProcessor(engine=args.ocr_engine)
|
||||
screen_segments = ocr.process_frames(
|
||||
frames_info,
|
||||
deduplicate=not args.no_deduplicate
|
||||
)
|
||||
logger.info(f"✓ Processed {len(screen_segments)} frames with text content")
|
||||
logger.info(f"✓ Extracted {len(frames_info)} frames")
|
||||
|
||||
except ImportError as e:
|
||||
logger.error(f"{e}")
|
||||
logger.error(f"To install {args.ocr_engine}:")
|
||||
logger.error(f" pip install {args.ocr_engine}")
|
||||
sys.exit(1)
|
||||
# Step 2: Run analysis on frames (with caching)
|
||||
if not args.no_cache and analysis_cache.exists():
|
||||
logger.info(f"✓ Found cached analysis results: {analysis_cache}")
|
||||
with open(analysis_cache, 'r', encoding='utf-8') as f:
|
||||
screen_segments = json.load(f)
|
||||
logger.info(f"✓ Loaded {len(screen_segments)} analyzed frames from cache")
|
||||
else:
|
||||
if args.use_vision:
|
||||
# Use vision model
|
||||
logger.info("Step 2: Running vision analysis on extracted frames...")
|
||||
try:
|
||||
vision = VisionProcessor(model=args.vision_model)
|
||||
screen_segments = vision.process_frames(
|
||||
frames_info,
|
||||
context=args.vision_context,
|
||||
deduplicate=not args.no_deduplicate
|
||||
)
|
||||
logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model")
|
||||
|
||||
# Save OCR results as JSON
|
||||
ocr_output = output_dir / f"{video_path.stem}_ocr.json"
|
||||
with open(ocr_output, 'w', encoding='utf-8') as f:
|
||||
json.dump(screen_segments, f, indent=2, ensure_ascii=False)
|
||||
logger.info(f"✓ Saved OCR results to: {ocr_output}")
|
||||
except ImportError as e:
|
||||
logger.error(f"{e}")
|
||||
sys.exit(1)
|
||||
else:
|
||||
# Use OCR
|
||||
logger.info("Step 2: Running OCR on extracted frames...")
|
||||
try:
|
||||
ocr = OCRProcessor(engine=args.ocr_engine)
|
||||
screen_segments = ocr.process_frames(
|
||||
frames_info,
|
||||
deduplicate=not args.no_deduplicate
|
||||
)
|
||||
logger.info(f"✓ Processed {len(screen_segments)} frames with OCR")
|
||||
|
||||
except ImportError as e:
|
||||
logger.error(f"{e}")
|
||||
logger.error(f"To install {args.ocr_engine}:")
|
||||
logger.error(f" pip install {args.ocr_engine}")
|
||||
sys.exit(1)
|
||||
|
||||
# Save analysis results as JSON
|
||||
with open(analysis_cache, 'w', encoding='utf-8') as f:
|
||||
json.dump(screen_segments, f, indent=2, ensure_ascii=False)
|
||||
logger.info(f"✓ Saved analysis results to: {analysis_cache}")
|
||||
|
||||
if args.extract_only:
|
||||
logger.info("Done! (extract-only mode)")
|
||||
|
||||
Reference in New Issue
Block a user