"""Seed an S3 bucket with the PDF files found under a local directory."""
import os
|
|
import sys
|
|
|
|
import boto3
|
|
from botocore.client import Config
|
|
from botocore.exceptions import ClientError
|
|
|
|
# Runtime configuration; each value can be overridden through the environment.

# Name of the bucket that receives the uploaded files.
BUCKET = os.getenv("BUCKET_NAME", "my-company-reports-bucket")

# Text prepended verbatim to every object key (no separator is inserted).
PREFIX = os.getenv("PREFIX", "2026/04/")

# Endpoint URL handed to the boto3 S3 client.
ENDPOINT = os.getenv("S3_ENDPOINT_URL", "http://localhost:9000")

# Lower-case filename suffixes uploaded in addition to ".pdf". The tuple is
# empty, so at present only PDF files are selected.
DECOY_EXTS = tuple()
|
|
|
|
|
|
def _client():
    """Return a boto3 S3 client pointed at ``ENDPOINT``.

    Credentials and region are read from ``AWS_ACCESS_KEY_ID``,
    ``AWS_SECRET_ACCESS_KEY`` and ``AWS_REGION``; when unset they fall back
    to ``"minioadmin"`` / ``"minioadmin"`` / ``"us-east-1"``. Requests are
    signed with signature version ``s3v4``.
    """
    env = os.environ
    settings = {
        "endpoint_url": ENDPOINT,
        "aws_access_key_id": env.get("AWS_ACCESS_KEY_ID", "minioadmin"),
        "aws_secret_access_key": env.get("AWS_SECRET_ACCESS_KEY", "minioadmin"),
        "region_name": env.get("AWS_REGION", "us-east-1"),
        "config": Config(signature_version="s3v4"),
    }
    return boto3.client("s3", **settings)
|
|
|
|
|
|
def _ensure_bucket(s3, name):
    """Create bucket *name* unless it already exists.

    Args:
        s3: S3 client exposing ``head_bucket`` and ``create_bucket``.
        name: Name of the bucket to check for and, if absent, create.

    Raises:
        ClientError: If ``head_bucket`` fails for a reason other than the
            bucket being absent (for example access denied), or if
            ``create_bucket`` itself fails.
    """
    try:
        s3.head_bucket(Bucket=name)
        return
    except ClientError as exc:
        response = getattr(exc, "response", None) or {}
        code = str(response.get("Error", {}).get("Code", ""))
        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
        # Only a "not found" answer means the bucket is missing. Anything
        # else (403, bad credentials, throttling, ...) must surface rather
        # than be papered over by a create attempt.
        missing = code in ("404", "NoSuchBucket", "NotFound") or status == 404
        if not missing:
            raise
    s3.create_bucket(Bucket=name)
|
|
|
|
|
|
def _walk(source_dir):
    """Yield the path of every file found under *source_dir*, recursively.

    Each path is the containing directory (as reported by ``os.walk``)
    joined with the file name. Directories themselves are not yielded.
    """
    for dirpath, _subdirs, filenames in os.walk(source_dir):
        yield from (os.path.join(dirpath, fname) for fname in filenames)
|
|
|
|
|
|
def main():
    """Upload the PDF (and decoy) files under a source directory to S3.

    The source directory comes from ``argv[1]`` or, failing that, the
    ``SOURCE_DIR`` environment variable. Files whose lower-cased path ends
    in ``.pdf`` or in one of ``DECOY_EXTS`` are uploaded to ``BUCKET`` under
    the key ``PREFIX`` + the file's path relative to the source directory
    (with ``/`` separators). A file that fails to upload is reported on
    stderr and skipped; the run continues with the next file.

    Exits with status 2 when no source directory is supplied or when the
    supplied path is not a directory.
    """
    # Imported here so the block is self-contained: upload_file re-raises S3
    # client errors as S3UploadFailedError, which must be caught explicitly
    # for the "skip and continue" handling below to cover failed uploads.
    from boto3.exceptions import S3UploadFailedError

    source_dir = sys.argv[1] if len(sys.argv) > 1 else os.environ.get("SOURCE_DIR")
    if not source_dir:
        print("usage: SOURCE_DIR=<path> python seed.py (or pass as argv[1])", file=sys.stderr)
        sys.exit(2)
    if not os.path.isdir(source_dir):
        print(f"not a directory: {source_dir}", file=sys.stderr)
        sys.exit(2)

    s3 = _client()
    _ensure_bucket(s3, BUCKET)

    pdf_n = decoy_n = 0
    for path in _walk(source_dir):
        lower = path.lower()
        is_pdf = lower.endswith(".pdf")
        is_decoy = lower.endswith(DECOY_EXTS)
        if not (is_pdf or is_decoy):
            continue

        # Object keys always use "/" regardless of the local path separator.
        rel = os.path.relpath(path, source_dir).replace(os.sep, "/")
        key = f"{PREFIX}{rel}"
        try:
            s3.upload_file(path, BUCKET, key)
        except (ClientError, S3UploadFailedError, OSError) as exc:
            print(f" skip {path}: {exc}", file=sys.stderr)
            continue
        if is_pdf:
            pdf_n += 1
        else:
            decoy_n += 1
        # Progress line after every 100 successful uploads.
        if (pdf_n + decoy_n) % 100 == 0:
            print(f" uploaded {pdf_n} pdfs / {decoy_n} decoys ...")

    print(f"done: {pdf_n} pdfs and {decoy_n} decoys uploaded to s3://{BUCKET}/{PREFIX}")
|
|
|
|
|
|
# Run the upload only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|