"""Upload PDF and decoy files from a local directory tree to an S3 bucket.

Every ``.pdf`` file, plus every file whose extension is in ``DECOY_EXTS``,
found under the source directory is uploaded to ``BUCKET`` under ``PREFIX``,
keeping its path relative to the source directory as the rest of the key.

Configuration is read from environment variables: ``BUCKET_NAME``,
``PREFIX``, ``S3_ENDPOINT_URL``, ``AWS_ACCESS_KEY_ID``,
``AWS_SECRET_ACCESS_KEY``, ``AWS_REGION`` and ``SOURCE_DIR``.
"""

import os
import sys
from typing import Iterator

import boto3
from botocore.client import Config
from botocore.exceptions import ClientError

BUCKET = os.environ.get("BUCKET_NAME", "my-company-reports-bucket")
PREFIX = os.environ.get("PREFIX", "2026/04/")
# NOTE(review): the endpoint and credential defaults look like a local
# MinIO setup -- confirm before pointing this at anything else.
ENDPOINT = os.environ.get("S3_ENDPOINT_URL", "http://localhost:9000")
# Non-PDF extensions that are uploaded alongside the PDFs.
DECOY_EXTS = (".txt", ".csv", ".json")

# Error codes that mean "the bucket does not exist" in a head_bucket failure.
_MISSING_BUCKET_CODES = frozenset({"404", "NoSuchBucket", "NotFound"})


def _client():
    """Return a boto3 S3 client for ``ENDPOINT`` using SigV4 signing.

    Credentials and region come from the environment, falling back to
    ``minioadmin`` / ``minioadmin`` and ``us-east-1``.
    """
    return boto3.client(
        "s3",
        endpoint_url=ENDPOINT,
        aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID", "minioadmin"),
        aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY", "minioadmin"),
        region_name=os.environ.get("AWS_REGION", "us-east-1"),
        config=Config(signature_version="s3v4"),
    )


def _ensure_bucket(s3, name):
    """Create bucket *name* if it does not exist yet.

    Args:
        s3: S3 client exposing ``head_bucket`` and ``create_bucket``.
        name: Bucket name to check and, when missing, create.

    Raises:
        ClientError: If ``head_bucket`` fails for any reason other than the
            bucket being absent (e.g. access denied), or if ``create_bucket``
            itself fails.
    """
    try:
        s3.head_bucket(Bucket=name)
    except ClientError as exc:
        code = str(exc.response.get("Error", {}).get("Code", ""))
        if code not in _MISSING_BUCKET_CODES:
            # Permission or connectivity problems must surface as-is rather
            # than be disguised by a doomed create_bucket call.
            raise
        # NOTE(review): no CreateBucketConfiguration is sent; presumably fine
        # for the default endpoint/region -- confirm for other regions.
        s3.create_bucket(Bucket=name)


def _walk(source_dir: str) -> Iterator[str]:
    """Yield the path of every file found recursively under *source_dir*."""
    for root, _, files in os.walk(source_dir):
        for name in files:
            yield os.path.join(root, name)


def main() -> None:
    """Upload matching files from the source directory and print a summary.

    The source directory is ``sys.argv[1]`` when given, otherwise the
    ``SOURCE_DIR`` environment variable. Exits with status 2 when no
    directory is supplied or the path is not a directory. A file whose
    upload fails is reported on stderr and skipped; it is not counted.
    """
    source_dir = sys.argv[1] if len(sys.argv) > 1 else os.environ.get("SOURCE_DIR")
    if not source_dir:
        print("usage: SOURCE_DIR= python seed.py (or pass as argv[1])", file=sys.stderr)
        sys.exit(2)
    if not os.path.isdir(source_dir):
        print(f"not a directory: {source_dir}", file=sys.stderr)
        sys.exit(2)

    s3 = _client()
    _ensure_bucket(s3, BUCKET)

    pdf_n = decoy_n = 0
    for path in _walk(source_dir):
        # Extension matching is case-insensitive.
        lower = path.lower()
        is_pdf = lower.endswith(".pdf")
        is_decoy = lower.endswith(DECOY_EXTS)
        if not (is_pdf or is_decoy):
            continue

        # Object keys always use "/" regardless of the local path separator.
        rel = os.path.relpath(path, source_dir).replace(os.sep, "/")
        key = f"{PREFIX}{rel}"
        try:
            s3.upload_file(path, BUCKET, key)
        except (ClientError, OSError) as exc:
            # Best effort: report the failure and carry on with the rest.
            print(f" skip {path}: {exc}", file=sys.stderr)
            continue

        if is_pdf:
            pdf_n += 1
        else:
            decoy_n += 1
        # Progress line after every 100 successful uploads.
        if (pdf_n + decoy_n) % 100 == 0:
            print(f" uploaded {pdf_n} pdfs / {decoy_n} decoys ...")

    print(f"done: {pdf_n} pdfs and {decoy_n} decoys uploaded to s3://{BUCKET}/{PREFIX}")


if __name__ == "__main__":
    main()