metadata pipeline via Step Functions
This commit is contained in:
65
functions/extract_metadata/handler.py
Normal file
65
functions/extract_metadata/handler.py
Normal file
@@ -0,0 +1,65 @@
|
||||
"""Extract metadata from a single PDF in S3.
|
||||
|
||||
Called once per PDF by the SFN Map state — input is one S3 key, output is
|
||||
that PDF's metadata. Designed for parallel invocation.
|
||||
|
||||
Input event:
|
||||
{"key": "2026/04/document.pdf"} (bucket from BUCKET_NAME env var)
|
||||
{"bucket": "...", "key": "..."} (full override)
|
||||
|
||||
Output:
|
||||
{"key": "...", "pages": N, "title": "..."|null, "author": "..."|null,
|
||||
"size_bytes": N}
|
||||
|
||||
A failed parse (corrupt PDF, unsupported encryption) returns a row with
|
||||
pages=0 and an "error" field — the Map state continues with the rest;
|
||||
the bad PDF shows up later in the aggregate as a parse error count.
|
||||
"""
|
||||
import io
|
||||
import os
|
||||
|
||||
import boto3
|
||||
from pypdf import PdfReader
|
||||
from pypdf.errors import PdfReadError
|
||||
|
||||
# Shared S3 client, created once when the module is imported. If the
# S3_ENDPOINT_URL environment variable is set to a non-empty value it is used
# as the endpoint; when it is unset or empty, `or None` passes None so boto3
# picks its normal endpoint.
_s3 = boto3.client("s3", endpoint_url=os.environ.get("S3_ENDPOINT_URL") or None)
|
||||
|
||||
|
||||
def _clean(value) -> str | None:
|
||||
if value is None:
|
||||
return None
|
||||
s = str(value).strip()
|
||||
return s or None
|
||||
|
||||
|
||||
def handler(event, context):
    """Return the metadata row for one PDF stored in S3.

    Args:
        event: Mapping with a required "key" and an optional "bucket". When
            "bucket" is missing or falsy, the BUCKET_NAME environment
            variable is used instead.
        context: Lambda context object; not used.

    Returns:
        A dict with "key", "pages", "title", "author" and "size_bytes".
        If the PDF cannot be parsed, "pages" is 0, "title" and "author" are
        None, and an additional "error" field holds
        "<ExceptionType>: <message>".

    Raises:
        KeyError: If "key" is absent from the event, or if the event has no
            usable "bucket" and BUCKET_NAME is not set.
        Errors raised while downloading the object from S3 are not caught
        here and propagate to the caller.
    """
    bucket = event.get("bucket") or os.environ["BUCKET_NAME"]
    key = event["key"]

    # The download is outside the try block: only parse failures are turned
    # into an error row, S3 failures propagate.
    body = _s3.get_object(Bucket=bucket, Key=key)["Body"].read()

    # Start from the failure-shaped row; the parsed values overwrite the
    # placeholders on success. Key order matches the documented output.
    row = {
        "key": key,
        "pages": 0,
        "title": None,
        "author": None,
        "size_bytes": len(body),
    }

    try:
        reader = PdfReader(io.BytesIO(body))
        pages = len(reader.pages)
        info = reader.metadata or {}
        title = _clean(info.get("/Title"))
        author = _clean(info.get("/Author"))
    except Exception as exc:
        # Broad catch kept as in the original: any failure while parsing is
        # reported as data. `Exception` already covers PdfReadError, so
        # listing both in a tuple was redundant.
        row["error"] = f"{type(exc).__name__}: {exc}"
        return row

    row.update(pages=pages, title=title, author=author)
    return row
|
||||
3
functions/extract_metadata/requirements.txt
Normal file
3
functions/extract_metadata/requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
# pypdf is pure Python — small footprint, no native wheels needed for arm64.
|
||||
# boto3 is already in the Lambda Python runtime, no need to bundle.
|
||||
pypdf>=5.0
|
||||
32
functions/list_pdfs/handler.py
Normal file
32
functions/list_pdfs/handler.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""List every .pdf key under the configured bucket+prefix.
|
||||
|
||||
Output:
{"keys": ["2026/04/a.pdf", ...], "count": N, "pages": P}
where N is the number of matching keys and P is the number of S3 listing
pages read (not PDF pages).
|
||||
|
||||
Used as the first state in the metadata-index pipeline. The "keys" array
|
||||
feeds an SFN Map state that runs ExtractMetadata in parallel per key.
|
||||
"""
|
||||
import os
|
||||
|
||||
import boto3
|
||||
|
||||
# Shared S3 client, created once when the module is imported. If the
# S3_ENDPOINT_URL environment variable is set to a non-empty value it is used
# as the endpoint; when it is unset or empty, `or None` passes None so boto3
# picks its normal endpoint.
_s3 = boto3.client("s3", endpoint_url=os.environ.get("S3_ENDPOINT_URL") or None)
|
||||
|
||||
|
||||
def handler(event, context):
    """Collect every key ending in ".pdf" (case-insensitive) under a prefix.

    Args:
        event: Mapping that may carry "bucket" and "prefix". Each one falls
            back to the BUCKET_NAME / PREFIX environment variable when it is
            missing or falsy.
        context: Lambda context object; not used.

    Returns:
        A dict with "keys" (the matching object keys, in listing order),
        "count" (how many keys matched) and "pages" (how many listing pages
        the paginator yielded).

    Raises:
        KeyError: If a value is needed from the environment and the
            corresponding variable is not set.
    """
    bucket = event.get("bucket") or os.environ["BUCKET_NAME"]
    prefix = event.get("prefix") or os.environ["PREFIX"]

    pdf_keys = []
    page_count = 0  # stays 0 if the paginator yields nothing

    listing = _s3.get_paginator("list_objects_v2").paginate(
        Bucket=bucket,
        Prefix=prefix,
        PaginationConfig={"PageSize": 1000},
    )
    for page_count, result in enumerate(listing, start=1):
        # A page may have no "Contents" entry, or a None one; treat both as empty.
        entries = result.get("Contents", []) or []
        pdf_keys.extend(
            entry["Key"]
            for entry in entries
            if entry["Key"].lower().endswith(".pdf")
        )

    return {"keys": pdf_keys, "count": len(pdf_keys), "pages": page_count}
|
||||
1
functions/list_pdfs/requirements.txt
Normal file
1
functions/list_pdfs/requirements.txt
Normal file
@@ -0,0 +1 @@
|
||||
# boto3 is provided by the Lambda Python runtime — no deps to bundle.
|
||||
Reference in New Issue
Block a user