From c063308954823ce6a300c2dff4487b2f765d8c9e Mon Sep 17 00:00:00 2001 From: Kyle Carter Date: Mon, 28 Sep 2020 16:41:52 -0600 Subject: [PATCH 1/6] Started looking into not pushing the same objects that are already there. --- s3-pit-restore | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/s3-pit-restore b/s3-pit-restore index ae16d2a..e359716 100755 --- a/s3-pit-restore +++ b/s3-pit-restore @@ -184,10 +184,11 @@ def signal_handler(signal, frame): print("Gracefully exiting ...") def print_obj(obj, optional_message=""): - if args.verbose: - print('"%s" %s %s %s %s %s' % (obj["LastModified"], obj["VersionId"], obj["Size"], obj["StorageClass"], obj["Key"], optional_message)) - else: - print(obj["Key"]) + print("") + #if args.verbose: + #print('"%s" %s %s %s %s %s' % (obj["LastModified"], obj["VersionId"], obj["Size"], obj["StorageClass"], obj["Key"], optional_message)) + #else: + #print(obj["Key"]) def handled_by_glacier(obj): if obj["StorageClass"] == "GLACIER" and not args.enable_glacier: @@ -229,6 +230,7 @@ def handled_by_standard(obj): global futures futures[future] = obj else: + print("Handling via standard") print_obj(obj) except RuntimeError: return False @@ -236,6 +238,8 @@ def handled_by_standard(obj): def handled_by_copy(obj): if args.dry_run: + #print("handling via copy") + is_correct_version(obj) print_obj(obj) return True future = executor.submit(s3_copy_object, obj) @@ -253,6 +257,15 @@ def get_key(obj): return obj["Key"] return os.path.join(args.dest_prefix, obj["Key"]) +def is_correct_version(obj): + print(obj["Key"]) + print('What I was handed %s' % (obj["VersionId"])) + source_etag = obj["VersionId"] + response = client.head_object(Bucket=args.dest_bucket, Key=obj["Key"]) + print('What is currently there : %s' % (response["VersionId"])) + destination_etag = response["VersionId"] + print("Has changed? 
%s" % (source_etag != destination_etag)) + def s3_copy_object(obj): copy_source= { 'Bucket': args.bucket, From 4058cfb2c5ab4b7c499ee20ecb1f1e16662f1250 Mon Sep 17 00:00:00 2001 From: Kyle Carter Date: Tue, 29 Sep 2020 09:54:14 -0600 Subject: [PATCH 2/6] Skipped files that are the same on copy, likely need to put this behind a flag --- s3-pit-restore | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/s3-pit-restore b/s3-pit-restore index e359716..fa8a305 100755 --- a/s3-pit-restore +++ b/s3-pit-restore @@ -32,6 +32,7 @@ import os, sys, time, signal, argparse, boto3, botocore, \ from datetime import datetime, timezone from dateutil.parser import parse from s3transfer.manager import TransferConfig +from botocore.exceptions import ClientError args = None executor = None @@ -184,11 +185,10 @@ def signal_handler(signal, frame): print("Gracefully exiting ...") def print_obj(obj, optional_message=""): - print("") - #if args.verbose: - #print('"%s" %s %s %s %s %s' % (obj["LastModified"], obj["VersionId"], obj["Size"], obj["StorageClass"], obj["Key"], optional_message)) - #else: - #print(obj["Key"]) + if args.verbose: + print('"%s" %s %s %s %s %s' % (obj["LastModified"], obj["VersionId"], obj["Size"], obj["StorageClass"], obj["Key"], optional_message)) + else: + print(obj["Key"]) def handled_by_glacier(obj): if obj["StorageClass"] == "GLACIER" and not args.enable_glacier: @@ -230,16 +230,15 @@ def handled_by_standard(obj): global futures futures[future] = obj else: - print("Handling via standard") print_obj(obj) except RuntimeError: return False return True def handled_by_copy(obj): - if args.dry_run: - #print("handling via copy") - is_correct_version(obj) + if not needs_copy(obj): + return True + if args.dry_run: print_obj(obj) return True future = executor.submit(s3_copy_object, obj) @@ -247,6 +246,20 @@ def handled_by_copy(obj): futures[future] = obj return True +def needs_copy(obj): + try: + destination_object_data 
= client.head_object(Bucket=args.dest_bucket, Key=obj["Key"]) + except ClientError as error: + if error.response['ResponseMetadata']['HTTPStatusCode'] == 404: + return True + else: + raise error + # Won't work for files uploaded with different multipart chunk sizes + if args.bucket != args.dest_bucket: + return obj["ETag"] != destination_object_data["ETag"] + else: + return obj["VersionId"] != destination_object_data["VersionId"] + def download_file(obj): transfer.download_file(args.bucket, obj["Key"], obj["Key"], extra_args={"VersionId": obj["VersionId"]}) unixtime = time.mktime(obj["LastModified"].timetuple()) @@ -257,14 +270,6 @@ def get_key(obj): return obj["Key"] return os.path.join(args.dest_prefix, obj["Key"]) -def is_correct_version(obj): - print(obj["Key"]) - print('What I was handed %s' % (obj["VersionId"])) - source_etag = obj["VersionId"] - response = client.head_object(Bucket=args.dest_bucket, Key=obj["Key"]) - print('What is currently there : %s' % (response["VersionId"])) - destination_etag = response["VersionId"] - print("Has changed? 
%s" % (source_etag != destination_etag)) def s3_copy_object(obj): copy_source= { From 25eff3c5d5745d3f99255a12fdc572e0690672b5 Mon Sep 17 00:00:00 2001 From: Kyle Carter Date: Wed, 30 Sep 2020 15:44:10 -0600 Subject: [PATCH 3/6] Restored default behavior of making duplicates and put removal of duplicates behind command line flag --- s3-pit-restore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/s3-pit-restore b/s3-pit-restore index fa8a305..d9d6cbb 100755 --- a/s3-pit-restore +++ b/s3-pit-restore @@ -236,7 +236,7 @@ def handled_by_standard(obj): return True def handled_by_copy(obj): - if not needs_copy(obj): + if args.avoid_duplicates and not needs_copy(obj): return True if args.dry_run: print_obj(obj) @@ -411,6 +411,7 @@ if __name__=='__main__': parser.add_argument('--debug', help='enable debug output', action='store_true') parser.add_argument('--test', help='s3 pit restore testing', action='store_true') parser.add_argument('--max-workers', help='max number of concurrent download requests', default=10, type=int) + parser.add_argument('--avoid-duplicates', help='avoids copying files if the latest version is the version that matches timestamp requested', action='store_true') args = parser.parse_args() if args.dest_bucket is None and not args.dest: From 5300b51ccc61092e8c2bd47c8e2477403326ce33 Mon Sep 17 00:00:00 2001 From: Kyle Carter Date: Fri, 2 Oct 2020 10:12:44 -0600 Subject: [PATCH 4/6] Added some documentation about new flags --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 0dddf3a..8ec1afb 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,7 @@ usage: s3-pit-restore [-h] -b BUCKET [-B DEST_BUCKET] [-d DEST] [-P DEST_PREFIX] [-p PREFIX] [-t TIMESTAMP] [-f FROM_TIMESTAMP] [-e] [-v] [--dry-run] [--debug] [--test] [--max-workers MAX_WORKERS] + [--avoid-duplicates] optional arguments: -h, --help show this help message and exit @@ -111,6 +112,7 @@ optional arguments: --test s3 pit restore testing 
--max-workers MAX_WORKERS max number of concurrent download requests + --avoid-duplicates tries to avoid sopying files that are already at the latest version ``` ## Docker Usage From 0d11de7ebb51f6b876879b17c777466b9bdb130d Mon Sep 17 00:00:00 2001 From: Kyle Carter Date: Fri, 2 Oct 2020 10:22:08 -0600 Subject: [PATCH 5/6] Fixed tabs to spaces --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8ec1afb..3d0c2ac 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ usage: s3-pit-restore [-h] -b BUCKET [-B DEST_BUCKET] [-d DEST] [-P DEST_PREFIX] [-p PREFIX] [-t TIMESTAMP] [-f FROM_TIMESTAMP] [-e] [-v] [--dry-run] [--debug] [--test] [--max-workers MAX_WORKERS] - [--avoid-duplicates] + [--avoid-duplicates] optional arguments: -h, --help show this help message and exit @@ -112,7 +112,7 @@ optional arguments: --test s3 pit restore testing --max-workers MAX_WORKERS max number of concurrent download requests - --avoid-duplicates tries to avoid sopying files that are already at the latest version + --avoid-duplicates tries to avoid sopying files that are already at the latest version ``` ## Docker Usage From 83c7631305637bdc72dc67293cb1046c23216135 Mon Sep 17 00:00:00 2001 From: Kyle Carter Date: Fri, 9 Oct 2020 14:44:47 -0600 Subject: [PATCH 6/6] Fixed whitespace issue and typo --- README.md | 2 +- s3-pit-restore | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 3d0c2ac..aff715b 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,7 @@ optional arguments: --test s3 pit restore testing --max-workers MAX_WORKERS max number of concurrent download requests - --avoid-duplicates tries to avoid sopying files that are already at the latest version + --avoid-duplicates tries to avoid copying files that are already at the latest version ``` ## Docker Usage diff --git a/s3-pit-restore b/s3-pit-restore index d9d6cbb..1bc47fc 100755 --- a/s3-pit-restore +++ b/s3-pit-restore @@ 
-270,7 +270,6 @@ def get_key(obj): return obj["Key"] return os.path.join(args.dest_prefix, obj["Key"]) - def s3_copy_object(obj): copy_source= { 'Bucket': args.bucket,