"""Extract metadata from a single PDF in S3. Called once per PDF by the SFN Map state — input is one S3 key, output is that PDF's metadata. Designed for parallel invocation. Input event: {"key": "2026/04/document.pdf"} (bucket from BUCKET_NAME env var) {"bucket": "...", "key": "..."} (full override) Output: {"key": "...", "pages": N, "title": "..."|null, "author": "..."|null, "size_bytes": N} A failed parse (corrupt PDF, unsupported encryption) returns a row with pages=0 and an "error" field — the Map state continues with the rest; the bad PDF shows up later in the aggregate as a parse error count. """ import io import os import boto3 from pypdf import PdfReader from pypdf.errors import PdfReadError _s3 = boto3.client("s3", endpoint_url=os.environ.get("S3_ENDPOINT_URL") or None) def _clean(value) -> str | None: if value is None: return None s = str(value).strip() return s or None def handler(event, context): bucket = event.get("bucket") or os.environ["BUCKET_NAME"] key = event["key"] obj = _s3.get_object(Bucket=bucket, Key=key) body = obj["Body"].read() size_bytes = len(body) try: reader = PdfReader(io.BytesIO(body)) pages = len(reader.pages) info = reader.metadata or {} title = _clean(info.get("/Title")) author = _clean(info.get("/Author")) except (PdfReadError, Exception) as exc: return { "key": key, "pages": 0, "title": None, "author": None, "size_bytes": size_bytes, "error": f"{type(exc).__name__}: {exc}", } return { "key": key, "pages": pages, "title": title, "author": author, "size_bytes": size_bytes, }