def needs_copy(obj):
    """Return True when ``obj`` still has to be copied to the destination.

    The destination bucket (``args.dest_bucket``) is probed with
    ``head_object`` for the same key as the source object:

    * If the probe fails with HTTP 404 there is nothing at the destination,
      so a copy is needed.
    * If source and destination buckets differ, the ETags are compared.
    * If they are the same bucket, the VersionIds are compared, i.e. the
      copy is skipped only when the requested version is already the
      current one.

    Args:
        obj: dict describing the source object version; the "Key" entry is
            required, "ETag" / "VersionId" are read for the comparison.

    Returns:
        bool: False only when the destination object matches ``obj``.

    Raises:
        ClientError: any ``head_object`` failure other than HTTP 404.
    """
    # NOTE(review): the lookup uses obj["Key"] unchanged. The CLI also has a
    # DEST_PREFIX option; confirm this is the key the copy actually writes to.
    try:
        dest_head = client.head_object(Bucket=args.dest_bucket, Key=obj["Key"])
    except ClientError as error:
        # Tolerant lookup so an unusual error payload cannot replace the real
        # ClientError with a KeyError raised from inside this handler.
        status = error.response.get("ResponseMetadata", {}).get("HTTPStatusCode")
        if status == 404:
            # No object at the destination key yet.
            return True
        # Bare raise re-raises the active exception untouched.
        raise

    # Original author's caveat for the ETag path: it won't work for files
    # uploaded with different multipart chunk sizes (identical content then
    # compares as different, so the object is copied anyway).
    field = "ETag" if args.bucket != args.dest_bucket else "VersionId"

    source_value = obj.get(field)
    # If either side lacks the field we cannot prove the objects match, so
    # fall back to copying instead of failing with a KeyError.
    return source_value is None or source_value != dest_head.get(field)
time.mktime(obj["LastModified"].timetuple()) @@ -401,6 +418,7 @@ if __name__=='__main__': parser.add_argument('--debug', help='enable debug output', action='store_true') parser.add_argument('--test', help='s3 pit restore testing', action='store_true') parser.add_argument('--max-workers', help='max number of concurrent download requests', default=10, type=int) + parser.add_argument('--avoid-duplicates', help='avoids copying files if the latest version is the version that matches timestamp requested', action='store_true') parser.add_argument('--sse', choices=['AES256', 'aws:kms'], help='Specify server-side encryption') args = parser.parse_args()