33 lines
972 B
Python
33 lines
972 B
Python
"""List every .pdf key under the configured bucket+prefix.

Output:
    {"keys": ["2026/04/a.pdf", ...], "count": N, "pages": P}

where N is the number of keys returned and P is the number of listing
pages that were read.

Used as the first state in the metadata-index pipeline. The "keys" array
feeds an SFN Map state that runs ExtractMetadata in parallel per key.
"""
|
|
import os
|
|
|
|
import boto3
|
|
|
|
# Shared S3 client, created once when the module is imported and reused by
# handler(). If S3_ENDPOINT_URL is set and non-empty it is passed as the
# endpoint override; an unset or empty value becomes endpoint_url=None.
# NOTE(review): presumably the override targets a local/S3-compatible
# endpoint for testing — confirm against the deployment config.
_s3 = boto3.client("s3", endpoint_url=os.environ.get("S3_ENDPOINT_URL") or None)
|
|
|
|
|
|
def handler(event, context):
    """Collect the keys of all PDF objects below an S3 prefix.

    Args:
        event: Mapping that may carry "bucket" and/or "prefix". A missing
            or falsy value falls back to the BUCKET_NAME / PREFIX
            environment variable respectively.
        context: Unused.

    Returns:
        A dict with "keys" (matching object keys, in listing order),
        "count" (the number of matching keys) and "pages" (the number of
        list_objects_v2 result pages that were iterated).

    Raises:
        KeyError: If a value is supplied neither by the event nor by the
            corresponding environment variable.
    """
    target_bucket = event.get("bucket") or os.environ["BUCKET_NAME"]
    key_prefix = event.get("prefix") or os.environ["PREFIX"]

    listing = _s3.get_paginator("list_objects_v2").paginate(
        Bucket=target_bucket,
        Prefix=key_prefix,
        PaginationConfig={"PageSize": 1000},
    )

    pdf_keys = []
    page_total = 0  # stays 0 if the paginator yields no pages at all
    for page_total, page in enumerate(listing, start=1):
        # A page without "Contents" (or with a falsy value) has no objects.
        contents = page.get("Contents") or []
        # The suffix test is case-insensitive, so ".PDF" keys match as well.
        pdf_keys.extend(
            entry["Key"]
            for entry in contents
            if entry["Key"].lower().endswith(".pdf")
        )

    return {"keys": pdf_keys, "count": len(pdf_keys), "pages": page_total}
|