embed images
This commit is contained in:
@@ -53,20 +53,25 @@ class OCRProcessor:
|
||||
else:
|
||||
raise ValueError(f"Unknown OCR engine: {self.engine}")
|
||||
|
||||
def extract_text(self, image_path: str) -> str:
|
||||
def extract_text(self, image_path: str, preserve_layout: bool = True) -> str:
|
||||
"""
|
||||
Extract text from a single image.
|
||||
|
||||
Args:
|
||||
image_path: Path to image file
|
||||
preserve_layout: Try to preserve whitespace and layout
|
||||
|
||||
Returns:
|
||||
Extracted text
|
||||
"""
|
||||
if self.engine == "tesseract":
|
||||
from PIL import Image
|
||||
import pytesseract
|
||||
image = Image.open(image_path)
|
||||
text = self._ocr_engine.image_to_string(image)
|
||||
|
||||
# Use PSM 6 (uniform block of text) to preserve layout better
|
||||
config = '--psm 6' if preserve_layout else ''
|
||||
text = pytesseract.image_to_string(image, config=config)
|
||||
|
||||
elif self.engine == "easyocr":
|
||||
result = self._ocr_engine.readtext(image_path, detail=0)
|
||||
@@ -81,12 +86,31 @@ class OCRProcessor:
|
||||
|
||||
return self._clean_text(text)
|
||||
|
||||
def _clean_text(self, text: str) -> str:
    """Tidy raw OCR output by normalizing its whitespace.

    Two regex substitutions are applied in order, then the result is
    trimmed at both ends:

    1. a newline, optional whitespace, and another newline become a
       single newline;
    2. a run of one or more spaces becomes a single space.

    Args:
        text: Raw OCR text

    Returns:
        The text after both substitutions, with leading and trailing
        whitespace stripped
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r'\n\s*\n', '\n'),
        (r' +', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
|
||||
def _clean_text(self, text: str, preserve_indentation: bool = True) -> str:
    """
    Clean up OCR output.

    Args:
        text: Raw OCR text
        preserve_indentation: Keep leading whitespace on lines. When
            False, fall back to the aggressive regex-based cleaning
            that also squeezes runs of spaces.

    Returns:
        Cleaned text. In the indentation-preserving mode, blank lines
        are kept only when they directly follow a line with content,
        trailing whitespace is removed from the end of the text, and
        the leading whitespace of every kept line (including the first
        one) is left untouched. Returns an empty string when the input
        contains no non-whitespace characters.
    """
    if not preserve_indentation:
        # Original aggressive cleaning
        text = re.sub(r'\n\s*\n', '\n', text)
        text = re.sub(r' +', ' ', text)
        return text.strip()

    cleaned_lines = []
    for line in text.split('\n'):
        # Keep a line if it has content, or if it is the first blank line
        # after a content line (consecutive blank lines collapse to one,
        # and blank lines before any content are dropped).
        if line.strip() or (cleaned_lines and cleaned_lines[-1].strip()):
            cleaned_lines.append(line)

    # Only trim the right-hand side: the first kept line always has
    # content, so a full strip() would do nothing but eat that line's
    # indentation, which is exactly what this mode is meant to keep.
    return '\n'.join(cleaned_lines).rstrip()
|
||||
|
||||
def process_frames(
|
||||
self,
|
||||
@@ -108,18 +132,24 @@ class OCRProcessor:
|
||||
results = []
|
||||
prev_text = ""
|
||||
|
||||
for frame_path, timestamp in frames_info:
|
||||
logger.debug(f"Processing frame at {timestamp:.2f}s...")
|
||||
for idx, (frame_path, timestamp) in enumerate(frames_info, 1):
|
||||
logger.debug(f"Processing frame {idx}/{len(frames_info)} at {timestamp:.2f}s...")
|
||||
text = self.extract_text(frame_path)
|
||||
|
||||
if not text:
|
||||
logger.debug(f"No text extracted from frame at {timestamp:.2f}s")
|
||||
continue
|
||||
|
||||
# Debug: Show what was extracted
|
||||
logger.debug(f"Frame {idx} ({timestamp:.2f}s): Extracted {len(text)} chars")
|
||||
logger.debug(f"Content preview: {text[:150]}{'...' if len(text) > 150 else ''}")
|
||||
|
||||
# Deduplicate similar consecutive frames
|
||||
if deduplicate:
|
||||
if deduplicate and prev_text:
|
||||
similarity = self._text_similarity(prev_text, text)
|
||||
logger.debug(f"Similarity to previous frame: {similarity:.2f} (threshold: {similarity_threshold})")
|
||||
if similarity > similarity_threshold:
|
||||
logger.debug(f"Skipping duplicate frame at {timestamp:.2f}s (similarity: {similarity:.2f})")
|
||||
logger.debug(f"⊘ Skipping duplicate frame at {timestamp:.2f}s (similarity: {similarity:.2f})")
|
||||
continue
|
||||
|
||||
results.append({
|
||||
|
||||
Reference in New Issue
Block a user