This commit is contained in:
2026-03-26 06:10:19 -03:00
parent 731964ca10
commit e27cb5bcc3
41 changed files with 2079 additions and 95 deletions

117
gpu/models/preprocess.py Normal file
View File

@@ -0,0 +1,117 @@
"""
Image preprocessing pipeline for crops before OCR.
Each step is independently toggleable via config.
Operates on numpy arrays (BGR or RGB), returns processed array.
"""
from __future__ import annotations
import logging
import numpy as np
logger = logging.getLogger(__name__)
def binarize(image: np.ndarray, threshold: int = 128) -> np.ndarray:
    """Convert to grayscale and apply Otsu binarization.

    Args:
        image: HxWx3 RGB (or HxW grayscale) uint8 array.
        threshold: Nominal threshold. NOTE: because THRESH_OTSU is set,
            OpenCV computes the threshold automatically and this value is
            ignored; the parameter is kept for interface compatibility.

    Returns:
        HxWx3 array with every channel equal to the 0/255 binary mask.
    """
    import cv2  # lazy import: OpenCV is an optional heavy dependency
    if image.ndim == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    else:
        gray = image
    # Otsu picks the global threshold itself; `threshold` is a no-op here.
    _, binary = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # Convert back to 3-channel so downstream stages see a consistent shape.
    return cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
def deskew(image: np.ndarray) -> np.ndarray:
    """Correct slight rotation using the minimum-area bounding rectangle.

    Estimates skew from the orientation of the dark (ink) pixels and
    rotates the full image to compensate. Returns the input unchanged
    when there is too little foreground or the skew is negligible.
    """
    import cv2  # lazy import: OpenCV is an optional heavy dependency
    if image.ndim == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    else:
        gray = image
    # Pixels darker than 128 are treated as foreground/ink.
    coords = np.column_stack(np.where(gray < 128))
    if len(coords) < 10:
        # Too few foreground pixels to estimate an orientation reliably.
        return image
    # BUG FIX: np.where yields int64 points, which cv2.minAreaRect rejects
    # (it requires int32 or float32); cast explicitly.
    rect = cv2.minAreaRect(coords.astype(np.float32))
    angle = rect[-1]
    # Normalize the rectangle angle into a signed correction angle.
    # NOTE(review): this follows the classic pre-OpenCV-4.5 convention
    # (angle in [-90, 0)); OpenCV >= 4.5 reports angles in [0, 90) —
    # verify against the deployed OpenCV version.
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    if abs(angle) < 0.5:
        # Skew below half a degree is not worth the resampling blur.
        return image
    h, w = image.shape[:2]
    center = (w // 2, h // 2)
    rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0)
    # BORDER_REPLICATE avoids black wedges at the rotated corners.
    return cv2.warpAffine(
        image, rotation_matrix, (w, h),
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_REPLICATE,
    )
def enhance_contrast(image: np.ndarray) -> np.ndarray:
    """Apply CLAHE (adaptive histogram equalization) for contrast normalization."""
    import cv2
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    # Grayscale input: equalize the single channel directly.
    if image.ndim != 3:
        return clahe.apply(image)
    # Color input: equalize only the lightness (L) channel in LAB space
    # so hue and saturation are left untouched.
    lab = cv2.cvtColor(image, cv2.COLOR_RGB2LAB)
    lab[:, :, 0] = clahe.apply(lab[:, :, 0])
    return cv2.cvtColor(lab, cv2.COLOR_LAB2RGB)
def preprocess(
    image: np.ndarray,
    do_binarize: bool = False,
    do_deskew: bool = False,
    do_contrast: bool = True,
) -> np.ndarray:
    """
    Run the preprocessing pipeline on a crop image.
    Each step is independently toggleable. Order: contrast → deskew → binarize.
    Contrast first (works best on color), binarize last (destroys color info).
    """
    # (enabled?, step function, debug message) — tuple order IS the pipeline order.
    pipeline = (
        (do_contrast, enhance_contrast, "Preprocessing: contrast enhanced"),
        (do_deskew, deskew, "Preprocessing: deskewed"),
        (do_binarize, binarize, "Preprocessing: binarized"),
    )
    out = image
    for enabled, step, message in pipeline:
        if enabled:
            out = step(out)
            logger.debug(message)
    return out

View File

@@ -25,3 +25,6 @@ paddleocr>=3.0.0
# (all_tied_weights_keys API change). Also needs accelerate for device_map.
transformers>=4.40.0,<5
accelerate>=0.27.0
# Preprocessing (phase 12)
opencv-python-headless>=4.8.0

View File

@@ -73,6 +73,17 @@ class OCRResponse(BaseModel):
results: list[OCRTextResult]
class PreprocessRequest(BaseModel):
    # Request body for POST /preprocess; flags mirror models.preprocess.preprocess().
    image: str  # base64-encoded input image
    binarize: bool = False  # apply Otsu binarization (runs last; destroys color)
    deskew: bool = False  # apply rotation correction
    contrast: bool = True  # apply CLAHE contrast normalization (runs first)
class PreprocessResponse(BaseModel):
    # Response body for POST /preprocess.
    image: str  # base64 JPEG of processed image
class VLMRequest(BaseModel):
image: str
prompt: str
@@ -183,6 +194,34 @@ def ocr(req: OCRRequest):
return OCRResponse(results=[OCRTextResult(**r) for r in results])
@app.post("/preprocess", response_model=PreprocessResponse)
def preprocess_image(req: PreprocessRequest):
    """Run the toggleable preprocessing pipeline on a base64 image.

    Returns the processed image re-encoded as base64 JPEG (quality 90).
    Responds 400 if the image cannot be decoded, 500 if a pipeline step
    fails.
    """
    import io

    from PIL import Image as PILImage

    try:
        image = _decode_image(req.image)
    except Exception as e:
        # Chain the cause so the original decode error survives in logs.
        raise HTTPException(status_code=400, detail=f"Bad image: {e}") from e
    try:
        # Imported lazily so the server starts even if OpenCV is absent.
        from models.preprocess import preprocess
        processed = preprocess(
            image,
            do_binarize=req.binarize,
            do_deskew=req.deskew,
            do_contrast=req.contrast,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Preprocessing failed: {e}") from e
    # NOTE(review): assumes the pipeline returns a uint8 array PIL can wrap
    # directly — confirm against models.preprocess return contract.
    img = PILImage.fromarray(processed)
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=90)
    result_b64 = base64.b64encode(buf.getvalue()).decode()
    return PreprocessResponse(image=result_b64)
@app.post("/vlm", response_model=VLMResponse)
def vlm(req: VLMRequest):
try: