106 lines
3.0 KiB
Python
106 lines
3.0 KiB
Python
"""PaddleOCR 3.x text extraction wrapper."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
|
|
from models import registry
|
|
from config import get_config
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _load(languages: list[str]):
    """
    Build a PaddleOCR model and store it in the shared registry.

    The registry key is derived from every entry in ``languages``, but the
    model itself is constructed with only the first entry as its ``lang``.
    Returns the newly created model.
    """
    # Imported here rather than at module import time.
    from paddleocr import PaddleOCR

    cache_key = "ocr_" + "_".join(languages)
    engine = PaddleOCR(lang=languages[0])
    registry.put(cache_key, engine)
    return engine
|
|
|
|
|
|
def _get(languages: list[str] | None = None):
    """
    Return the OCR model for ``languages``, loading it on first use.

    When ``languages`` is None or empty, the "ocr_languages" entry of the
    application config is used instead. The registry is consulted first and
    ``_load`` is only called on a miss.
    """
    langs = languages if languages else get_config()["ocr_languages"]
    cached = registry.get(f"ocr_{'_'.join(langs)}")
    return cached if cached is not None else _load(langs)
|
|
|
|
|
|
def _parse_raw(raw) -> list[tuple[list, str, float]]:
|
|
"""
|
|
Parse PaddleOCR output into (points, text, confidence) tuples.
|
|
|
|
PaddleOCR 3.x changed the result format. Two known layouts:
|
|
|
|
Layout A — dict-based (new pipeline API):
|
|
raw = [{'rec_texts': [...], 'rec_scores': [...], 'dt_polys': [...]}]
|
|
|
|
Layout B — nested list (2.x compat / some 3.x builds):
|
|
raw = [[ [points, [text, score]], ... ]]
|
|
raw = [[ [points, [text, score], [cls, cls_score]], ... ]] # with angle cls
|
|
"""
|
|
results = []
|
|
|
|
for page in raw:
|
|
if not page:
|
|
continue
|
|
|
|
# Layout A: dict with parallel lists
|
|
if isinstance(page, dict):
|
|
texts = page.get("rec_texts", [])
|
|
scores = page.get("rec_scores", [])
|
|
polys = page.get("dt_polys", [])
|
|
for points, text, confidence in zip(polys, texts, scores):
|
|
results.append((points, text, float(confidence)))
|
|
continue
|
|
|
|
# Layout B: list of per-line entries
|
|
for line in page:
|
|
if not line:
|
|
continue
|
|
|
|
# line[0] is always the polygon points
|
|
points = line[0]
|
|
|
|
# line[1] is [text, score] — ignore any extra elements (angle cls etc.)
|
|
rec = line[1]
|
|
if isinstance(rec, (list, tuple)) and len(rec) >= 2:
|
|
text, confidence = rec[0], rec[1]
|
|
else:
|
|
logger.warning("Unexpected OCR line format: %s", line)
|
|
continue
|
|
|
|
results.append((points, str(text), float(confidence)))
|
|
|
|
return results
|
|
|
|
|
|
def ocr(image, languages: list[str] | None = None, min_confidence: float | None = None) -> list[dict]:
    """
    Run OCR on an image and return one dict per accepted text line.

    Each dict has "text", "confidence" and "bbox", where "bbox" is the
    axis-aligned box around the detected polygon as integer
    [x, y, width, height]. Lines scoring below ``min_confidence`` are
    dropped; when that argument is None the "ocr_min_confidence" config
    value is used as the threshold.
    """
    settings = get_config()
    if min_confidence is None:
        threshold = settings["ocr_min_confidence"]
    else:
        threshold = min_confidence

    engine = _get(languages)
    raw = engine.ocr(image)
    logger.debug("OCR raw: %s", raw)

    hits = []
    for polygon, text, score in _parse_raw(raw):
        if score < threshold:
            continue

        # Collapse the polygon to its bounding rectangle.
        xs = [pt[0] for pt in polygon]
        ys = [pt[1] for pt in polygon]
        left, top = min(xs), min(ys)

        hits.append({
            "text": text,
            "confidence": score,
            "bbox": [int(left), int(top), int(max(xs) - left), int(max(ys) - top)],
        })

    return hits
|