#!/usr/bin/env python3
"""
Test OCR stage end-to-end — sends real images to the inference server.

Creates test images with known text, sends them through the /ocr endpoint,
verifies the text comes back. Tests both the inference server and the
ocr_stage module's remote path.

Usage:
    python tests/detect/manual/test_ocr_e2e.py [--url URL] [--timeout SECONDS]

Requires: inference server running (gpu/server.py)
"""

import argparse
import base64
import io
import json
import logging
import sys

import numpy as np
import requests
from PIL import Image, ImageDraw, ImageFont

logging.basicConfig(level=logging.INFO, format="%(levelname)-7s %(name)s — %(message)s")
logger = logging.getLogger(__name__)

# Upper bound (seconds) for each HTTP request issued directly by this script.
# Without a timeout, `requests` waits forever on an unresponsive server.
DEFAULT_TIMEOUT = 60.0


def make_text_image(text: str, width: int = 300, height: int = 80) -> np.ndarray:
    """Create a white image with black text for OCR testing.

    Args:
        text: The string to render onto the image.
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        The rendered RGB image converted to a numpy array.
    """
    img = Image.new("RGB", (width, height), "white")
    draw = ImageDraw.Draw(img)
    try:
        font = ImageFont.truetype(
            "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 36
        )
    except OSError:
        # The DejaVu font is not installed everywhere; fall back to Pillow's
        # built-in font so the script still runs.
        font = ImageFont.load_default()
    draw.text((10, 15), text, fill="black", font=font)
    return np.array(img)


def image_to_b64(image: np.ndarray) -> str:
    """Encode an image array as a base64 string of its JPEG bytes.

    Args:
        image: Image array accepted by ``PIL.Image.fromarray``.

    Returns:
        Base64 text of the JPEG-encoded image.
    """
    img = Image.fromarray(image)
    buf = io.BytesIO()
    img.save(buf, "JPEG")
    return base64.b64encode(buf.getvalue()).decode()


def test_health(url: str, timeout: float = DEFAULT_TIMEOUT):
    """Query ``{url}/health`` and log the reported status and device.

    Args:
        url: Base URL of the inference server (no trailing slash).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True when the request succeeded and the response was parsed.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx response.
        KeyError: If the response JSON lacks ``status`` or ``device``.
    """
    logger.info("--- Health check ---")
    resp = requests.get(f"{url}/health", timeout=timeout)
    resp.raise_for_status()
    data = resp.json()
    logger.info("Status: %s, device: %s", data["status"], data["device"])
    return True


def test_ocr_endpoint(url: str, text: str, timeout: float = DEFAULT_TIMEOUT):
    """Render ``text`` to an image, POST it to ``{url}/ocr`` and check the reply.

    A case-insensitive substring match between ``text`` and any returned
    region's text counts as a pass. A miss is only logged as a warning, it
    does not raise.

    Args:
        url: Base URL of the inference server (no trailing slash).
        text: The text to render and look for in the OCR output.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list found under the ``results`` key of the response JSON
        (empty list if the key is absent).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx response.
    """
    logger.info("--- OCR endpoint: '%s' ---", text)
    image = make_text_image(text)
    b64 = image_to_b64(image)

    resp = requests.post(f"{url}/ocr", json={"image": b64}, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    results = data.get("results", [])
    logger.info("Results: %d text regions", len(results))

    found = False
    for r in results:
        logger.info(
            "  text=%r confidence=%.3f bbox=%s",
            r["text"], r["confidence"], r["bbox"],
        )
        if text.lower() in r["text"].lower():
            found = True

    if found:
        logger.info("PASS — found '%s' in OCR output", text)
    else:
        logger.warning(
            "MISS — '%s' not found (may be font/rendering issue, check results above)",
            text,
        )

    return results


def test_ocr_stage_remote(url: str):
    """Test the detect/stages/ocr_stage.py remote path.

    Builds a single frame containing the text "EMIRATES", wraps the whole
    image in one bounding box, and passes it to ``run_ocr`` with
    ``inference_url`` set to ``url``. Logs every returned candidate; an empty
    result is logged as a warning rather than raised.

    Args:
        url: Base URL of the inference server (no trailing slash).
    """
    logger.info("--- OCR stage (remote mode) ---")

    # Put the current directory on the import path so the `detect` package
    # resolves — presumably the script is launched from the repository root
    # (see Usage in the module docstring); verify if run from elsewhere.
    sys.path.insert(0, ".")
    from detect.models import BoundingBox, Frame
    from detect.profiles.base import OCRConfig
    from detect.stages.ocr_stage import run_ocr

    # Create a frame with text baked in
    image = make_text_image("EMIRATES")
    frame = Frame(sequence=0, chunk_id=0, timestamp=1.0, image=image)
    # One box covering the full image: shape[1] is width, shape[0] is height.
    box = BoundingBox(
        x=0, y=0, w=image.shape[1], h=image.shape[0],
        confidence=0.9, label="text",
    )

    config = OCRConfig(languages=["en"], min_confidence=0.3)
    candidates = run_ocr(
        frames=[frame],
        boxes_by_frame={0: [box]},
        config=config,
        inference_url=url,
    )

    logger.info("Candidates: %d", len(candidates))
    for c in candidates:
        logger.info("  text=%r confidence=%.3f", c.text, c.ocr_confidence)

    if candidates:
        logger.info("PASS — ocr_stage remote path returned results")
    else:
        logger.warning("MISS — no candidates returned (check inference server logs)")


def main():
    """Parse CLI options and run every OCR check against the server in order."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", default="http://mcrndeb:8000")
    parser.add_argument(
        "--timeout",
        type=float,
        default=DEFAULT_TIMEOUT,
        help="seconds to wait for each HTTP request (default: %(default)s)",
    )
    args = parser.parse_args()

    # Endpoint paths are appended with a leading slash, so drop any trailing one.
    url = args.url.rstrip("/")
    logger.info("Inference server: %s", url)

    input("\nPress Enter to start...")

    test_health(url, timeout=args.timeout)
    test_ocr_endpoint(url, "NIKE", timeout=args.timeout)
    test_ocr_endpoint(url, "Coca-Cola", timeout=args.timeout)
    test_ocr_endpoint(url, "EMIRATES", timeout=args.timeout)
    test_ocr_stage_remote(url)

    logger.info("All OCR tests complete.")


if __name__ == "__main__":
    main()