Skip to content

Commit 8a675a3

Browse files
committed
Compare the actual etags instead of comparing md5sums
The server side currently takes a shortcut when calculating the value of the `md5sum` field: it uses the Object Storage (S3) ETag value. An ETag is normally an MD5 checksum. For multipart-uploaded files, however, the MD5 is computed from the concatenation of the MD5s of each uploaded part. Say you uploaded a 14MB file and your part size is 5MB. Calculate 3 MD5 checksums, one for each part, i.e. the checksum of the first 5MB, the second 5MB, and the last 4MB. Then take the checksum of their concatenation. Since MD5 checksums are hex representations of binary data, make sure you take the MD5 of the decoded binary concatenation, not of the ASCII or UTF-8 encoded concatenation. When that's done, append a hyphen and the number of parts to get the ETag.
1 parent 3d078a1 commit 8a675a3

File tree

2 files changed

+62
-12
lines changed

2 files changed

+62
-12
lines changed

qfieldcloud_sdk/sdk.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from requests.adapters import HTTPAdapter, Retry
1313

1414
from .interfaces import QfcException, QfcRequest, QfcRequestException
15-
from .utils import get_md5sum, log
15+
from .utils import calc_etag, log
1616

1717
logger = logging.getLogger(__file__)
1818

@@ -153,7 +153,11 @@ def list_remote_files(
153153
params["skip_metadata"] = "1"
154154

155155
resp = self._request("GET", f"files/{project_id}", params=params)
156-
return resp.json()
156+
remote_files = resp.json()
157+
# TODO remove this temporary decoration with `etag` key
158+
remote_files = list(map(lambda f: {"etag": f["md5sum"], **f}, remote_files))
159+
160+
return remote_files
157161

158162
def create_project(
159163
self,
@@ -217,8 +221,8 @@ def upload_files(
217221
remote_file = f
218222
break
219223

220-
md5sum = get_md5sum(local_file["absolute_filename"])
221-
if remote_file and remote_file.get("md5sum", None) == md5sum:
224+
etag = calc_etag(local_file["absolute_filename"])
225+
if remote_file and remote_file.get("etag", None) == etag:
222226
continue
223227

224228
files_to_upload.append(local_file)
@@ -537,9 +541,9 @@ def download_files(
537541

538542
for file in files_to_download:
539543
local_filename = Path(f'{local_dir}/{file["name"]}')
540-
md5sum = None
544+
etag = None
541545
if not force_download:
542-
md5sum = file.get("md5sum", None)
546+
etag = file.get("etag", None)
543547

544548
try:
545549
self.download_file(
@@ -548,7 +552,7 @@ def download_files(
548552
local_filename,
549553
file["name"],
550554
show_progress,
551-
md5sum,
555+
etag,
552556
)
553557
file["status"] = FileTransferStatus.SUCCESS
554558
except QfcRequestException as err:
@@ -575,7 +579,7 @@ def download_file(
575579
local_filename: Path,
576580
remote_filename: Path,
577581
show_progress: bool,
578-
remote_md5sum: str = None,
582+
remote_etag: str = None,
579583
) -> requests.Response:
580584
"""Download a single project file.
581585
@@ -585,7 +589,7 @@ def download_file(
585589
local_filename (Path): Local filename
586590
remote_filename (Path): Remote filename
587591
show_progress (bool): Show progressbar in the console
588-
remote_md5sum (str, optional): The md5sum of the remote file. If is None, the download of the file happens even if it already exists locally. Defaults to None.
592+
remote_etag (str, optional): The ETag of the remote file. If is None, the download of the file happens even if it already exists locally. Defaults to `None`.
589593
590594
Raises:
591595
NotImplementedError: Raised if unknown `download_type` is passed
@@ -594,8 +598,8 @@ def download_file(
594598
requests.Response: the response object
595599
"""
596600

597-
if remote_md5sum and local_filename.exists():
598-
if get_md5sum(str(local_filename)) == remote_md5sum:
601+
if remote_etag and local_filename.exists():
602+
if calc_etag(str(local_filename)) == remote_etag:
599603
if show_progress:
600604
print(
601605
f"{remote_filename}: Already present locally. Download skipped."

qfieldcloud_sdk/utils.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import hashlib
22
import json
3+
import os
34
import sys
45

56

@@ -14,7 +15,10 @@ def log(*msgs):
1415

1516

1617
def get_md5sum(filename: str) -> str:
17-
"""Calculate sha256sum of a file"""
18+
"""Calculate md5sum of a file.
19+
20+
Currently unused but will be revived in the upcoming versions.
21+
"""
1822
BLOCKSIZE = 65536
1923
hasher = hashlib.md5()
2024
with open(filename, "rb") as f:
@@ -23,3 +27,45 @@ def get_md5sum(filename: str) -> str:
2327
hasher.update(buf)
2428
buf = f.read(BLOCKSIZE)
2529
return hasher.hexdigest()
30+
31+
32+
def calc_etag(filename: str, part_size: int = 8 * 1024 * 1024) -> str:
    """Calculate ETag as in Object Storage (S3) of a local file.

    A file that is not larger than `part_size` gets the plain MD5 hex digest of its contents.
    A larger file is treated as a multipart upload: the MD5 is computed from the concatenation of the
    binary MD5 digests of each part, followed by a hyphen and the number of parts.

    See the inspiration of this implementation here: https://stackoverflow.com/a/58239738/1226137

    Args:
        filename (str): the local filename
        part_size (int): the size of the Object Storage part in bytes. Must be positive. Most Object Storages use 8MB. Defaults to 8*1024*1024.

    Raises:
        ValueError: Raised if `part_size` is not a positive number.

    Returns:
        str: the calculated ETag value
    """
    # A zero part size would make `f.read(part_size)` return `b""` right away (bogus "...-0" ETag),
    # and a negative one would read the whole file as a single part (bogus "...-1" ETag).
    if part_size <= 0:
        raise ValueError(f"part_size must be a positive number, got {part_size!r}")

    with open(filename, "rb") as f:
        file_size = os.fstat(f.fileno()).st_size

        if file_size <= part_size:
            # Single part: the ETag is the MD5 of the contents, hashed in small blocks
            # to avoid loading the whole file in memory.
            blocksize = 65536
            hasher = hashlib.md5()

            for buf in iter(lambda: f.read(blocksize), b""):
                hasher.update(buf)

            return hasher.hexdigest()

        # Say you uploaded a 14MB file and your part size is 5MB.
        # Calculate 3 MD5 checksums corresponding to each part, i.e. the checksum of the first 5MB, the second 5MB, and the last 4MB.
        # Then take the checksum of their concatenation.
        # The concatenation must be of the raw binary digests, not of their hex (ASCII/UTF-8) representations.
        # When that's done, add a hyphen and the number of parts to get the ETag.
        part_digests = [
            hashlib.md5(part).digest() for part in iter(lambda: f.read(part_size), b"")
        ]
        final_md5sum = hashlib.md5(b"".join(part_digests))

        return f"{final_md5sum.hexdigest()}-{len(part_digests)}"

0 commit comments

Comments
 (0)