diff --git a/.gitignore b/.gitignore
index f15588f..5928d27 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,8 @@
 # Files
 download_links.txt
+download_commands.txt
+downloads/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/README.md b/README.md
index b520869..8415f32 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,11 @@
-# NuPlan-Download-CLI
+# NuScenes-Download-CLI
+This repo provides scripts to download the datasets hosted on [NuScenes](https://www.nuscenes.org/), including nuScenes itself and nuPlan.
 
 ## Usage
 
+### NuPlan
+
 ```bash
 python download_nuplan.py --username --password
 wget -i download_links.txt
 ```
@@ -10,5 +13,21 @@ wget -i download_links.txt
 The URLs should be valid for about five days. If you need to download the files again, just run the script again to generate the URLs.
 Godspeed with the terrabytes of downloads and good luck choking and hogging your entire team's bandwidth.
 
+### NuScenes
+
+```bash
+python download_nuscenes.py --username <username> --password <password>
+sh download_commands.txt
+```
+
+### Extract the data
+
+After downloading the data, you can extract it using the following Python script.
+
+```bash
+python extract_parallel.py
+```
+
 ## Why
-The NuScenes team for some reason keeps these links behind a convoluted authentation and token expiration system. And that makes downloading them super hard unless you want to keep a browser open for 5 days straight or use a CurlWget extension for 165 different links individually. This script automates that process by reverse engineering the authentation system and capturing the bearer tokens responsible for generating those temporary URLs for download. You can then download those files using wget very easily! Have fun <3
+
+The NuScenes team for some reason keeps these links behind a convoluted authentication and token-expiration system. That makes downloading them super hard unless you want to keep a browser open for 5 days straight or use a CurlWget extension for 165 different links individually. This script automates that process by reverse engineering the authentication system and capturing the bearer tokens responsible for generating those temporary download URLs. You can then download those files using wget very easily!
+Have fun <3
\ No newline at end of file
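A note on the flow the README describes: both scripts first log in against AWS Cognito to obtain an ID token, then call the archive API with that token as a bearer to receive a short-lived signed URL per file. A minimal sketch of that round trip, reusing the `login` and `get_download_url` helpers that `download_nuscenes.py` adds below (the credentials are placeholders):

```python
# Sketch of the token flow, reusing helpers from download_nuscenes.py
# (added later in this diff). The credentials below are placeholders.
from download_nuscenes import login, get_download_url

token = login("user@example.com", "hunter2")    # Cognito IdToken
url = get_download_url(token, "v1.0-mini.tgz")  # short-lived signed URL
print(url)
```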
diff --git a/download_nuplan.py b/download_nuplan.py
index 7de2b54..bb5dc14 100644
--- a/download_nuplan.py
+++ b/download_nuplan.py
@@ -5,7 +5,9 @@
 from joblib import Parallel, delayed
 from tqdm import tqdm
 
-BASE_URL = "https://o9k5xn5546.execute-api.us-east-1.amazonaws.com/v1/archives/nuplan-v1.1/"
+BASE_URL = (
+    "https://o9k5xn5546.execute-api.us-east-1.amazonaws.com/v1/archives/nuplan-v1.1/"
+)
 
 
 def login(username, password):
@@ -66,28 +68,52 @@ def main():
 
     # nuPlan Test Set
     ## Lidars
-    links = [BASE_URL + f"sensor_blobs/test_set/nuplan-v1.1_test_lidar_{i}.zip" for i in range(12)]
+    links = [
+        BASE_URL + f"sensor_blobs/test_set/nuplan-v1.1_test_lidar_{i}.zip"
+        for i in range(12)
+    ]
     ## Cameras
-    links += [BASE_URL + f"sensor_blobs/test_set/nuplan-v1.1_test_camera_{i}.zip" for i in range(12)]
+    links += [
+        BASE_URL + f"sensor_blobs/test_set/nuplan-v1.1_test_camera_{i}.zip"
+        for i in range(12)
+    ]
     # nuPlan Train Set
     ## Lidars
-    links += [BASE_URL + f"sensor_blobs/train_set/nuplan-v1.1_train_lidar_{i}.zip" for i in range(43)]
+    links += [
+        BASE_URL + f"sensor_blobs/train_set/nuplan-v1.1_train_lidar_{i}.zip"
+        for i in range(43)
+    ]
     ## Cameras
-    links += [BASE_URL + f"sensor_blobs/train_set/nuplan-v1.1_train_camera_{i}.zip" for i in range(43)]
+    links += [
+        BASE_URL + f"sensor_blobs/train_set/nuplan-v1.1_train_camera_{i}.zip"
+        for i in range(43)
+    ]
     # nuScenes Val Set
     ## Lidars
-    links += [BASE_URL + f"sensor_blobs/val_set/nuplan-v1.1_val_lidar_{i}.zip" for i in range(12)]
+    links += [
+        BASE_URL + f"sensor_blobs/val_set/nuplan-v1.1_val_lidar_{i}.zip"
+        for i in range(12)
+    ]
     ## Cameras
-    links += [BASE_URL + f"sensor_blobs/val_set/nuplan-v1.1_val_camera_{i}.zip" for i in range(12)]
+    links += [
+        BASE_URL + f"sensor_blobs/val_set/nuplan-v1.1_val_camera_{i}.zip"
+        for i in range(12)
+    ]
     # Maps
     links += [BASE_URL + "nuplan-maps-v1.0.zip"]
     # Mini Split
     links += [BASE_URL + "nuplan-v1.1_mini.zip"]
     # Mini Sensors
     ## Lidars
-    links += [BASE_URL + f"sensor_blobs/mini_set/nuplan-v1.1_mini_lidar_{i}.zip" for i in range(9)]
+    links += [
+        BASE_URL + f"sensor_blobs/mini_set/nuplan-v1.1_mini_lidar_{i}.zip"
+        for i in range(9)
+    ]
     ## Cameras
-    links += [BASE_URL + f"sensor_blobs/mini_set/nuplan-v1.1_mini_camera_{i}.zip" for i in range(9)]
+    links += [
+        BASE_URL + f"sensor_blobs/mini_set/nuplan-v1.1_mini_camera_{i}.zip"
+        for i in range(9)
+    ]
     # Log DB Train Splits
     links += [
         BASE_URL + f"nuplan-v1.1_train_{city}.zip"
@@ -108,7 +134,9 @@ def main():
     # Log DB Test Splits
     links += [BASE_URL + "nuplan-v1.1_test.zip"]
 
-    download_links = Parallel(n_jobs=12)(delayed(get_download_url)(login_token, link) for link in tqdm(links))
+    download_links = Parallel(n_jobs=12)(
+        delayed(get_download_url)(login_token, link) for link in tqdm(links)
+    )
 
     # write download links to file
     with open("download_links.txt", "w") as f:
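For completeness, `wget -i download_links.txt` can be approximated in plain Python when wget is unavailable. This is only a sketch, and it assumes each signed URL keeps the archive name as its last path component:

```python
# Rough Python stand-in for `wget -i download_links.txt` (sketch only).
import os
from urllib.parse import urlparse

import requests

with open("download_links.txt") as f:
    links = [line.strip() for line in f if line.strip()]

for link in links:
    # Assumes the archive name is the last path component of the signed URL.
    name = os.path.basename(urlparse(link).path)
    with requests.get(link, stream=True) as r:
        r.raise_for_status()
        with open(name, "wb") as out:
            for chunk in r.iter_content(chunk_size=1 << 20):
                out.write(chunk)
```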
diff --git a/download_nuscenes.py b/download_nuscenes.py
new file mode 100644
index 0000000..4723a91
--- /dev/null
+++ b/download_nuscenes.py
@@ -0,0 +1,126 @@
+import argparse
+import json
+import requests
+import utils
+
+from joblib import Parallel, delayed
+from tqdm import tqdm
+
+
+def login(username, password):
+    headers = {
+        "content-type": "application/x-amz-json-1.1",
+        "x-amz-target": "AWSCognitoIdentityProviderService.InitiateAuth",
+    }
+
+    data = (
+        '{"AuthFlow":"USER_PASSWORD_AUTH","ClientId":"7fq5jvs5ffs1c50hd3toobb3b9","AuthParameters":{"USERNAME":"'
+        + username
+        + '","PASSWORD":"'
+        + password
+        + '"},"ClientMetadata":{}}'
+    )
+
+    response = requests.post(
+        "https://cognito-idp.us-east-1.amazonaws.com/",
+        headers=headers,
+        data=data,
+    )
+
+    token = json.loads(response.content)["AuthenticationResult"]["IdToken"]
+
+    return token
+
+
+def get_download_url(token, file_name):
+    # The URL prefix of the NuScenes dataset. Fetched from the request url.
+    BASE_URL = (
+        "https://o9k5xn5546.execute-api.us-east-1.amazonaws.com/v1/archives/v1.0/"
+    )
+
+    headers = {
+        "authorization": "Bearer " + token,
+    }
+
+    # From the same source as BASE_URL.
+    params = {
+        "region": "us",
+        "project": "nuScenes",
+    }
+
+    url = BASE_URL + file_name
+    response = requests.get(
+        url,
+        params=params,
+        headers=headers,
+    )
+
+    download_url = json.loads(response.content)["url"]
+
+    return download_url
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Download nuScenes dataset")
+    parser.add_argument("--username")
+    parser.add_argument("--password")
+    parser.add_argument(
+        "--data_output_dir", type=str, default=utils.get_default_data_output_dir()
+    )
+    args = parser.parse_args()
+
+    data_output_dir = args.data_output_dir
+    # requests session
+    with requests.Session() as s:
+        # login and get auth token
+        login_token = login(args.username, args.password)
+
+        # ====================
+        # NuScenes Trainval Set
+        # ====================
+        file_names = [f"v1.0-trainval{i:02d}_blobs.tgz" for i in range(1, 11)]
+
+        # Metadata
+        file_names += ["v1.0-trainval_meta.tgz"]
+
+        # ====================
+        # NuScenes Test Set
+        # ====================
+        file_names += ["v1.0-test_blobs.tgz"]
+
+        # Metadata
+        file_names += ["v1.0-test_meta.tgz"]
+
+        # ====================
+        # NuScenes Mini
+        # ====================
+        file_names += ["v1.0-mini.tgz"]
+
+        # ====================
+        # Shared data
+        # ====================
+        # CAN bus expansion
+        file_names += ["can_bus.zip"]
+
+        # Map expansion (v1.3)
+        file_names += ["nuScenes-map-expansion-v1.3.zip"]
+
+        download_links = Parallel(n_jobs=12)(
+            delayed(get_download_url)(login_token, file_name)
+            for file_name in tqdm(file_names)
+        )
+
+        # write download commands to file
+        with open("download_commands.txt", "w") as f:
+            # a command to create the output directory if it does not exist.
+            f.write(f"mkdir -p {data_output_dir}\n")
+            # a command to enter the output directory.
+            f.write(f"cd {data_output_dir}\n")
+            # commands to download each file.
+            for i in range(len(download_links)):
+                command = f'wget -O {file_names[i]} "{download_links[i]}"\n'
+                f.write(command)
+
+
+if __name__ == "__main__":
+    main()
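One caveat in `login()` above: the request body is assembled by string concatenation, so a password containing a double quote or backslash would produce invalid JSON. A safer construction is sketched below; `build_auth_payload` is a hypothetical helper, and the payload fields and ClientId are unchanged from the diff:

```python
import json


def build_auth_payload(username: str, password: str) -> str:
    # Same payload as the concatenation in login(), but json.dumps escapes
    # quotes and backslashes in the credentials correctly.
    return json.dumps(
        {
            "AuthFlow": "USER_PASSWORD_AUTH",
            "ClientId": "7fq5jvs5ffs1c50hd3toobb3b9",
            "AuthParameters": {"USERNAME": username, "PASSWORD": password},
            "ClientMetadata": {},
        }
    )
```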
+ """ + files = [] + for file in os.listdir(folder_path): + file_path = os.path.join(folder_path, file) + if os.path.isfile(file_path): + files.append(file_path) -def extract_zip(zip_file): - os.system(f"tar -xf {zip_file}") - print(f"Extracted {zip_file}") + return files -if __name__ == "__main__": - zip_files = ["nuplan-v1.1_train_lidar_{}.zip".format(i) for i in range(1, 44)] # Assuming 44 zip files from 0 to 43 +def extract_zip(zip_file_path, output_dir): + with zipfile.ZipFile(zip_file_path, "r") as zip_ref: + # Get the list of files to extract. + file_list = zip_ref.namelist() + + # Create the output directory if it doesn't exist. + os.makedirs(output_dir, exist_ok=True) + + # Extract files with progress using tqdm. + for file in tqdm( + file_list, + desc=f"Extracting file {zip_file_path} to {output_dir}", + unit=" files", + ): + zip_ref.extract(file, output_dir) + + +def extract_tar(tar_file_path, output_dir): + with tarfile.open(tar_file_path, "r") as tar_ref: + # Get the list of members (files/directories) to extract. + member_list = tar_ref.getmembers() + + # Create the output directory if it doesn't exist. + os.makedirs(output_dir, exist_ok=True) + + # Extract members with progress using tqdm. + for member in tqdm( + member_list, + desc=f"Extracting file {tar_file_path} to {output_dir}", + unit=" members", + ): + tar_ref.extract(member, output_dir) + + +def extract_compressed_files(file: str, output_dir: str): + """ + Extracts a compressed file. If the file is not ended with ".tgz" or ".zip", report this error and do nothing. + """ + file_extension = os.path.splitext(file)[-1] - # Extract the zip files in parallel - num_processes = 8 # Adjust this number based on your system's capabilities - Parallel(n_jobs=num_processes)(delayed(extract_zip)(zip_file) for zip_file in zip_files) + if file_extension == ".zip": + extract_zip(file, output_dir) + elif file_extension == ".tgz": + extract_tar(file, output_dir) + else: + print(f"Unsupported file {file}!") + return + + print(f"File {file} extracted.") + + +def main(): + parser = argparse.ArgumentParser(description="Extract compressed files.") + # Adjust the num_process based on your system's capabilities. + parser.add_argument("--num_process", type=int, default=8) + parser.add_argument( + "--data_output_dir", type=str, default=utils.get_default_data_output_dir() + ) + args = parser.parse_args() + + data_output_dir = args.data_output_dir + num_process = args.num_process + files = get_all_files(data_output_dir) + + Parallel(n_jobs=num_process)( + delayed(extract_compressed_files)(file, data_output_dir) for file in files + ) + + +if __name__ == "__main__": + main() diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..9085a76 --- /dev/null +++ b/utils.py @@ -0,0 +1,2 @@ +def get_default_data_output_dir(): + return "downloads"