Skip to content

Commit 441473d

Browse files
authored
sha256 for binaries index (#6241)
Restores the sha256 hash for binaries if it exists. In pytorch/pytorch#144887 the sha256 of the binary is manually calculated and uploaded as metadata with the binary because the automatic one given by S3 is wrong (likely due to it being part of a multipart upload; see https://docs.aws.amazon.com/AmazonS3/latest/userguide/tutorial-s3-mpu-additional-checksums.html). This PR: * Attempts to add a check for multipart-upload sha256s and does not show them in the index if they are part of a multipart upload; * Surfaces the new metadata added by the above PR; * Restores the sha256 that was previously removed due to being wrong. I also went back and corrected the bad shas from the multipart upload (not that it would have mattered, as this PR wouldn't show them in the index). The semantics of `copy_from`, which is used in manage.py when `--compute-sha256` is passed, are correct and do not have the multipart-upload issue.
1 parent 6288b87 commit 441473d

File tree

1 file changed

+9
-8
lines changed

1 file changed

+9
-8
lines changed

s3_management/manage.py

+9-8
Original file line number | Diff line number | Diff line change
@@ -382,14 +382,7 @@ def to_simple_package_html(
382382
out.append(' <body>')
383383
out.append(' <h1>Links for {}</h1>'.format(package_name.lower().replace("_", "-")))
384384
for obj in sorted(self.gen_file_list(subdir, package_name)):
385-
386385
maybe_fragment = f"#sha256={obj.checksum}" if obj.checksum else ""
387-
388-
# Temporary skip assigning sha256 to nightly index
389-
# to be reverted on Jan 24, 2025.
390-
if subdir is not None and "nightly" in subdir:
391-
maybe_fragment = ""
392-
393386
pep658_attribute = ""
394387
if obj.pep658:
395388
pep658_sha = f"sha256={obj.pep658}"
@@ -528,6 +521,7 @@ def fetch_object_names(cls: Type[S3IndexType], prefix: str) -> List[str]:
528521

529522
def fetch_metadata(self: S3IndexType) -> None:
530523
# Add PEP 503-compatible hashes to URLs to allow clients to avoid spurious downloads, if possible.
524+
regex_multipart_upload = r"^[A-Za-z0-9+/=]+=-[0-9]+$"
531525
with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
532526
for idx, future in {
533527
idx: executor.submit(
@@ -540,10 +534,17 @@ def fetch_metadata(self: S3IndexType) -> None:
540534
if obj.size is None
541535
}.items():
542536
response = future.result()
543-
sha256 = (_b64 := response.get("ChecksumSHA256")) and base64.b64decode(_b64).hex()
537+
raw = response.get("ChecksumSHA256")
538+
if raw and match(regex_multipart_upload, raw):
539+
# Possibly part of a multipart upload, making the checksum incorrect
540+
print(f"WARNING: {self.objects[idx].orig_key} has bad checksum: {raw}")
541+
raw = None
542+
sha256 = raw and base64.b64decode(raw).hex()
544543
# For older files, rely on checksum-sha256 metadata that can be added to the file later
545544
if sha256 is None:
546545
sha256 = response.get("Metadata", {}).get("checksum-sha256")
546+
if sha256 is None:
547+
sha256 = response.get("Metadata", {}).get("x-amz-meta-checksum-sha256")
547548
self.objects[idx].checksum = sha256
548549
if size := response.get("ContentLength"):
549550
self.objects[idx].size = int(size)

0 commit comments

Comments
 (0)