"""
|
|
Stage 3.5 — Preprocessing
|
|
|
|
Runs between YOLO detection and OCR. Applies configurable image
|
|
preprocessing to each detected region crop: contrast enhancement,
|
|
deskewing, binarization.
|
|
|
|
Operates on the crops derived from boxes_by_frame, produces
|
|
preprocessed_crops keyed by (frame_sequence, box_index).
|
|
"""

from __future__ import annotations

import logging

import numpy as np

from detect import emit
from detect.models import BoundingBox, Frame

logger = logging.getLogger(__name__)


def _crop_region(frame: Frame, box: BoundingBox) -> np.ndarray:
    """Crop a detection box out of a frame, clamping it to the image bounds."""
    h, w = frame.image.shape[:2]
    x1 = max(0, box.x)
    y1 = max(0, box.y)
    x2 = min(w, box.x + box.w)
    y2 = min(h, box.y + box.h)
    return frame.image[y1:y2, x1:x2]
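
# Worked example of the clamping above (assuming pixel-coordinate boxes and an
# H x W x C frame image): a box at x=-5, y=10 with w=50, h=40 on a 100x100
# frame is clipped to columns 0..45 and rows 10..50, so a detection that spills
# past the frame edge still yields a valid, smaller crop rather than an error.
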
def preprocess_regions(
    frames: list[Frame],
    boxes_by_frame: dict[int, list[BoundingBox]],
    do_contrast: bool = True,
    do_deskew: bool = False,
    do_binarize: bool = False,
    inference_url: str | None = None,
    job_id: str | None = None,
) -> dict[str, np.ndarray]:
    """
    Preprocess cropped regions from YOLO detections.

    Returns a dict keyed by "{frame_seq}_{box_idx}" → preprocessed crop.
    These crops are passed to the OCR stage instead of the raw crops.
    """
    total_regions = sum(len(boxes) for boxes in boxes_by_frame.values())
    any_active = do_contrast or do_deskew or do_binarize

    if not any_active:
        emit.log(job_id, "Preprocess", "INFO",
                 f"Preprocessing disabled, passing {total_regions} regions through")
        return {}

    mode = "remote" if inference_url else "local"
    emit.log(job_id, "Preprocess", "INFO",
             f"Preprocessing {total_regions} regions (mode={mode}, "
             f"contrast={do_contrast}, deskew={do_deskew}, binarize={do_binarize})")

    # Index frames by sequence number so boxes can be matched to their frame.
    frame_map = {f.sequence: f for f in frames}
    preprocessed: dict[str, np.ndarray] = {}
    processed_count = 0

    for seq, boxes in boxes_by_frame.items():
        frame = frame_map.get(seq)
        if frame is None:
            continue

        for idx, box in enumerate(boxes):
            crop = _crop_region(frame, box)
            if crop.size == 0:
                # Box fell entirely outside the frame after clamping.
                continue

            key = f"{seq}_{idx}"

            # Remote mode offloads to the GPU server; otherwise run in-process.
            if inference_url:
                result = _preprocess_remote(crop, inference_url,
                                            do_contrast, do_deskew, do_binarize)
            else:
                result = _preprocess_local(crop, do_contrast, do_deskew, do_binarize)

            preprocessed[key] = result
            processed_count += 1

    emit.log(job_id, "Preprocess", "INFO",
             f"Preprocessed {processed_count} regions")

    return preprocessed
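
# A sketch of how a pipeline driver might call this stage (the driver and the
# OCR hand-off shown here are assumptions, not defined in this module):
#
#     crops = preprocess_regions(
#         frames,
#         boxes_by_frame,
#         do_contrast=True,
#         do_deskew=True,
#         inference_url="http://gpu-host:8000",  # hypothetical endpoint
#         job_id="abc123",                       # hypothetical job id
#     )
#     # An empty dict means preprocessing was disabled, in which case the OCR
#     # stage keeps working from the raw crops.
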

def _preprocess_remote(crop: np.ndarray, inference_url: str,
                       do_contrast: bool, do_deskew: bool, do_binarize: bool) -> np.ndarray:
    """Call the GPU server's /preprocess endpoint."""
    import base64
    import io

    import requests
    from PIL import Image

    # Encode the crop as a base64 JPEG for the request payload.
    img = Image.fromarray(crop)
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=85)
    image_b64 = base64.b64encode(buf.getvalue()).decode()

    resp = requests.post(
        f"{inference_url.rstrip('/')}/preprocess",
        json={
            "image": image_b64,
            "contrast": do_contrast,
            "deskew": do_deskew,
            "binarize": do_binarize,
        },
        timeout=30,
    )
    resp.raise_for_status()
    data = resp.json()

    # Decode the returned base64 image back into an RGB numpy array.
    result_bytes = base64.b64decode(data["image"])
    result_img = Image.open(io.BytesIO(result_bytes)).convert("RGB")
    return np.array(result_img)
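
# Wire format implied by _preprocess_remote (a summary of the calls above; the
# server contract itself is defined elsewhere):
#
#   POST {inference_url}/preprocess
#   request body:  {"image": "<base64 JPEG>", "contrast": true, "deskew": false, "binarize": false}
#   response body: {"image": "<base64-encoded processed image>"}
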

def _preprocess_local(crop: np.ndarray,
                      do_contrast: bool, do_deskew: bool, do_binarize: bool) -> np.ndarray:
    """Run preprocessing in-process (requires opencv-python-headless)."""
    from gpu.models.preprocess import preprocess

    return preprocess(crop, do_contrast=do_contrast, do_deskew=do_deskew, do_binarize=do_binarize)
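
# ---------------------------------------------------------------------------
# Illustrative sketch only: the local path above delegates to
# gpu.models.preprocess.preprocess, which is not shown in this module and whose
# implementation may differ. The function below sketches what comparable
# OpenCV-based contrast enhancement and binarization could look like; its name
# is hypothetical and nothing in the pipeline calls it.
# ---------------------------------------------------------------------------
def _preprocess_local_sketch(crop: np.ndarray,
                             do_contrast: bool, do_binarize: bool) -> np.ndarray:
    import cv2

    # OCR preprocessing typically starts from a single grayscale channel.
    gray = cv2.cvtColor(crop, cv2.COLOR_RGB2GRAY) if crop.ndim == 3 else crop

    if do_contrast:
        # CLAHE (contrast-limited adaptive histogram equalization) boosts local
        # contrast without blowing out already-uniform regions.
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        gray = clahe.apply(gray)

    if do_binarize:
        # Otsu's method picks a global threshold from the image histogram.
        _, gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Deskewing is intentionally omitted from this sketch.
    return gray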