2 changes: 2 additions & 0 deletions .gitignore
@@ -4,6 +4,8 @@

# Files
download_links.txt
download_commands.txt
downloads/

# Byte-compiled / optimized / DLL files
__pycache__/
23 changes: 21 additions & 2 deletions README.md
@@ -1,14 +1,33 @@
# NuPlan-Download-CLI
# NuScenes-Download-CLI

This repo provides scripts to download the various datasets offered by [NuScenes](https://www.nuscenes.org/).

## Usage

### NuPlan

```bash
python download_nuplan.py --username <username> --password <password>
wget -i download_links.txt
```

The URLs should be valid for about five days. If you need to download the files again, just rerun the script to generate fresh URLs. Godspeed with the terabytes of downloads, and good luck not choking your entire team's bandwidth.
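
If you would rather not saturate the link, wget can throttle itself; this is just a suggestion, not something the script requires:

```bash
# Cap each download at roughly 20 MB/s; adjust to taste.
wget --limit-rate=20m -i download_links.txt
```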

### NuScenes

```bash
python download_nuscenes.py --username <username> --password <password>
sh download_commands.txt
```
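
`download_nuscenes.py` also takes an optional `--data_output_dir` flag (it defaults to `downloads`); the generated `download_commands.txt` starts by creating and `cd`-ing into that directory before running the `wget` commands. For example:

```bash
python download_nuscenes.py --username <username> --password <password> --data_output_dir downloads
sh download_commands.txt
```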

### Extract the data

After downloading the data, you can extract it using the following Python script.

```bash
python extract_parallel.py
```
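
The extractor also accepts `--num_process` (default 8) and `--data_output_dir` (default `downloads`), so you can match the parallelism to your machine:

```bash
python extract_parallel.py --num_process 8 --data_output_dir downloads
```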

## Why
The NuScenes team, for some reason, keeps these links behind a convoluted authentication and token-expiration system. That makes downloading the data painful unless you want to keep a browser open for 5 days straight or feed 165 different links through a CurlWget extension one by one. These scripts automate the process by reverse engineering the authentication flow and capturing the bearer tokens used to generate the temporary download URLs. You can then download the files with wget very easily! Have fun <3
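
In short, the flow the scripts implement looks roughly like this (a condensed sketch of the `login` and `get_download_url` functions further down; the endpoint, client ID, and parameters are the ones hardcoded in the scripts):

```python
import json
import requests

# 1. Authenticate against AWS Cognito with your nuScenes credentials to get an ID token.
auth = requests.post(
    "https://cognito-idp.us-east-1.amazonaws.com/",
    headers={
        "content-type": "application/x-amz-json-1.1",
        "x-amz-target": "AWSCognitoIdentityProviderService.InitiateAuth",
    },
    data=json.dumps({
        "AuthFlow": "USER_PASSWORD_AUTH",
        "ClientId": "7fq5jvs5ffs1c50hd3toobb3b9",
        "AuthParameters": {"USERNAME": "<username>", "PASSWORD": "<password>"},
        "ClientMetadata": {},
    }),
)
token = auth.json()["AuthenticationResult"]["IdToken"]

# 2. Ask the archive API for a temporary, pre-signed download URL for one file.
response = requests.get(
    "https://o9k5xn5546.execute-api.us-east-1.amazonaws.com/v1/archives/v1.0/v1.0-mini.tgz",
    params={"region": "us", "project": "nuScenes"},
    headers={"authorization": "Bearer " + token},
)
print(response.json()["url"])  # feed this URL to wget
```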

48 changes: 38 additions & 10 deletions download_nuplan.py
@@ -5,7 +5,9 @@
from joblib import Parallel, delayed
from tqdm import tqdm

BASE_URL = "https://o9k5xn5546.execute-api.us-east-1.amazonaws.com/v1/archives/nuplan-v1.1/"
BASE_URL = (
"https://o9k5xn5546.execute-api.us-east-1.amazonaws.com/v1/archives/nuplan-v1.1/"
)


def login(username, password):
@@ -66,28 +68,52 @@ def main():

# nuPlan Test Set
## Lidars
links = [BASE_URL + f"sensor_blobs/test_set/nuplan-v1.1_test_lidar_{i}.zip" for i in range(12)]
links = [
BASE_URL + f"sensor_blobs/test_set/nuplan-v1.1_test_lidar_{i}.zip"
for i in range(12)
]
## Cameras
links += [BASE_URL + f"sensor_blobs/test_set/nuplan-v1.1_test_camera_{i}.zip" for i in range(12)]
links += [
BASE_URL + f"sensor_blobs/test_set/nuplan-v1.1_test_camera_{i}.zip"
for i in range(12)
]
# nuPlan Train Set
## Lidars
links += [BASE_URL + f"sensor_blobs/train_set/nuplan-v1.1_train_lidar_{i}.zip" for i in range(43)]
links += [
BASE_URL + f"sensor_blobs/train_set/nuplan-v1.1_train_lidar_{i}.zip"
for i in range(43)
]
## Cameras
links += [BASE_URL + f"sensor_blobs/train_set/nuplan-v1.1_train_camera_{i}.zip" for i in range(43)]
links += [
BASE_URL + f"sensor_blobs/train_set/nuplan-v1.1_train_camera_{i}.zip"
for i in range(43)
]
# nuPlan Val Set
## Lidars
links += [BASE_URL + f"sensor_blobs/val_set/nuplan-v1.1_val_lidar_{i}.zip" for i in range(12)]
links += [
BASE_URL + f"sensor_blobs/val_set/nuplan-v1.1_val_lidar_{i}.zip"
for i in range(12)
]
## Cameras
links += [BASE_URL + f"sensor_blobs/val_set/nuplan-v1.1_val_camera_{i}.zip" for i in range(12)]
links += [
BASE_URL + f"sensor_blobs/val_set/nuplan-v1.1_val_camera_{i}.zip"
for i in range(12)
]
# Maps
links += [BASE_URL + "nuplan-maps-v1.0.zip"]
# Mini Split
links += [BASE_URL + "nuplan-v1.1_mini.zip"]
# Mini Sensors
## Lidars
links += [BASE_URL + f"sensor_blobs/mini_set/nuplan-v1.1_mini_lidar_{i}.zip" for i in range(9)]
links += [
BASE_URL + f"sensor_blobs/mini_set/nuplan-v1.1_mini_lidar_{i}.zip"
for i in range(9)
]
## Cameras
links += [BASE_URL + f"sensor_blobs/mini_set/nuplan-v1.1_mini_camera_{i}.zip" for i in range(9)]
links += [
BASE_URL + f"sensor_blobs/mini_set/nuplan-v1.1_mini_camera_{i}.zip"
for i in range(9)
]
# Log DB Train Splits
links += [
BASE_URL + f"nuplan-v1.1_train_{city}.zip"
@@ -108,7 +134,9 @@ def main():
# Log DB Test Splits
links += [BASE_URL + "nuplan-v1.1_test.zip"]

download_links = Parallel(n_jobs=12)(delayed(get_download_url)(login_token, link) for link in tqdm(links))
download_links = Parallel(n_jobs=12)(
delayed(get_download_url)(login_token, link) for link in tqdm(links)
)

# write download links to file
with open("download_links.txt", "w") as f:
126 changes: 126 additions & 0 deletions download_nuscenes.py
@@ -0,0 +1,126 @@
import argparse
import json
import requests
import utils

from joblib import Parallel, delayed
from tqdm import tqdm


def login(username, password):
headers = {
"content-type": "application/x-amz-json-1.1",
"x-amz-target": "AWSCognitoIdentityProviderService.InitiateAuth",
}

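# Cognito USER_PASSWORD_AUTH payload, built as a raw JSON string; the hardcoded ClientId is presumably the public client ID of the nuScenes web app.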
data = (
'{"AuthFlow":"USER_PASSWORD_AUTH","ClientId":"7fq5jvs5ffs1c50hd3toobb3b9","AuthParameters":{"USERNAME":"'
+ username
+ '","PASSWORD":"'
+ password
+ '"},"ClientMetadata":{}}'
)

response = requests.post(
"https://cognito-idp.us-east-1.amazonaws.com/",
headers=headers,
data=data,
)

token = json.loads(response.content)["AuthenticationResult"]["IdToken"]

return token


def get_download_url(token, file_name):
# The URL prefix of the NuScenes dataset. Fetched from the request url.
BASE_URL = (
"https://o9k5xn5546.execute-api.us-east-1.amazonaws.com/v1/archives/v1.0/"
)

headers = {
"authorization": "Bearer " + token,
}

# From the same source as BASE_URL.
params = {
"region": "us",
"project": "nuScenes",
}

url = BASE_URL + file_name
response = requests.get(
url,
params=params,
headers=headers,
)

download_url = json.loads(response.content)["url"]

return download_url


def main():
parser = argparse.ArgumentParser(description="Download nuScenes dataset")
parser.add_argument("--username")
parser.add_argument("--password")
parser.add_argument(
"--data_output_dir", type=str, default=utils.get_default_data_output_dir()
)
args = parser.parse_args()

data_output_dir = args.data_output_dir
# requests session
with requests.Session() as s:
# login and get auth token
login_token = login(args.username, args.password)

# ====================
# NuScenes Trainval Set
# ====================
file_names = [f"v1.0-trainval{i:02d}_blobs.tgz" for i in range(1, 11)]

# Metadata
file_names += ["v1.0-trainval_meta.tgz"]

# ====================
# NuScenes Test Set
# ====================
file_names += ["v1.0-test_blobs.tgz"]

# Metadata
file_names += ["v1.0-test_meta.tgz"]

# ====================
# NuScenes Mini
# ====================
file_names += ["v1.0-mini.tgz"]

# ====================
# Shared data
# ====================
# CAN bus expansion
file_names += ["can_bus.zip"]

# Map expansion (v1.3)
file_names += ["nuScenes-map-expansion-v1.3.zip"]

download_links = Parallel(n_jobs=12)(
delayed(get_download_url)(login_token, file_name)
for file_name in tqdm(file_names)
)

# write download commands to file
with open("download_commands.txt", "w") as f:
# a command to create the output directory if it does not exist.
f.write(f"mkdir -p {data_output_dir}\n")
# a command to enter the output directory.
f.write(f"cd {data_output_dir}\n")
# commands to download each file.
for i in range(len(download_links)):
command = f'wget -O {file_names[i]} "{download_links[i]}"\n'
f.write(command)


if __name__ == "__main__":
main()
99 changes: 87 additions & 12 deletions extract_parallel.py
@@ -1,21 +1,96 @@
import argparse
import os
import subprocess
import utils
import zipfile
import tarfile

from typing import List
from joblib import Parallel, delayed
from tqdm import tqdm

target_folder = "/net/acadia3a/data/datasets/nuplan"
os.chdir(target_folder)
os.system("ls")

def get_all_files(folder_path) -> List[str]:
"""
Returns a list of all the files in the folder_path. Folders in the folder_path will be ignored.
The returned files will have a path like "folder_path/file".
"""
files = []
for file in os.listdir(folder_path):
file_path = os.path.join(folder_path, file)
if os.path.isfile(file_path):
files.append(file_path)

def extract_zip(zip_file):
os.system(f"tar -xf {zip_file}")
print(f"Extracted {zip_file}")
return files


if __name__ == "__main__":
zip_files = ["nuplan-v1.1_train_lidar_{}.zip".format(i) for i in range(1, 44)] # Assuming 44 zip files from 0 to 43
def extract_zip(zip_file_path, output_dir):
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
# Get the list of files to extract.
file_list = zip_ref.namelist()

# Create the output directory if it doesn't exist.
os.makedirs(output_dir, exist_ok=True)

# Extract files with progress using tqdm.
for file in tqdm(
file_list,
desc=f"Extracting file {zip_file_path} to {output_dir}",
unit=" files",
):
zip_ref.extract(file, output_dir)


def extract_tar(tar_file_path, output_dir):
with tarfile.open(tar_file_path, "r") as tar_ref:
# Get the list of members (files/directories) to extract.
member_list = tar_ref.getmembers()

# Create the output directory if it doesn't exist.
os.makedirs(output_dir, exist_ok=True)

# Extract members with progress using tqdm.
for member in tqdm(
member_list,
desc=f"Extracting file {tar_file_path} to {output_dir}",
unit=" members",
):
tar_ref.extract(member, output_dir)


def extract_compressed_files(file: str, output_dir: str):
"""
Extract a compressed file into output_dir. If the file does not end in ".tgz" or ".zip", report an error and do nothing.
"""
file_extension = os.path.splitext(file)[-1]

# Extract the zip files in parallel
num_processes = 8 # Adjust this number based on your system's capabilities
Parallel(n_jobs=num_processes)(delayed(extract_zip)(zip_file) for zip_file in zip_files)
if file_extension == ".zip":
extract_zip(file, output_dir)
elif file_extension == ".tgz":
extract_tar(file, output_dir)
else:
print(f"Unsupported file {file}!")
return

print(f"File {file} extracted.")


def main():
parser = argparse.ArgumentParser(description="Extract compressed files.")
# Adjust the num_process based on your system's capabilities.
parser.add_argument("--num_process", type=int, default=8)
parser.add_argument(
"--data_output_dir", type=str, default=utils.get_default_data_output_dir()
)
args = parser.parse_args()

data_output_dir = args.data_output_dir
num_process = args.num_process
files = get_all_files(data_output_dir)

Parallel(n_jobs=num_process)(
delayed(extract_compressed_files)(file, data_output_dir) for file in files
)


if __name__ == "__main__":
main()
2 changes: 2 additions & 0 deletions utils.py
@@ -0,0 +1,2 @@
def get_default_data_output_dir():
return "downloads"