2 changes: 2 additions & 0 deletions .gitignore
@@ -4,6 +4,8 @@

# Files
download_links.txt
download_commands.txt
downloads/

# Byte-compiled / optimized / DLL files
__pycache__/
23 changes: 21 additions & 2 deletions README.md
@@ -1,14 +1,33 @@
# NuPlan-Download-CLI
# NuScenes-Download-CLI

This repo provides scripts to download the various datasets offered by [NuScenes](https://www.nuscenes.org/).

## Usage

### NuPlan

```bash
python download_nuplan.py --username <username> --password <password>
wget -i download_links.txt
```

The URLs should be valid for about five days. If you need to download the files again, just rerun the script to generate fresh URLs. Godspeed with the terabytes of downloads, and good luck not choking your entire team's bandwidth.
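
If you would rather not saturate the link, wget can throttle itself; this is just a suggestion, not something the script requires:

```bash
# Cap each download at roughly 20 MB/s; adjust to taste.
wget --limit-rate=20m -i download_links.txt
```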

### NuScenes

```bash
python download_nuscenes.py --username <username> --password <password>
sh download_commands.txt
```
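
`download_nuscenes.py` also takes an optional `--data_output_dir` flag (it defaults to `downloads`); the generated `download_commands.txt` starts by creating and `cd`-ing into that directory before running the `wget` commands. For example:

```bash
python download_nuscenes.py --username <username> --password <password> --data_output_dir downloads
sh download_commands.txt
```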

### Extract the data

After downloading the data, you can extract it using the following Python script.

```bash
python extract_parallel.py
```
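
The extractor also accepts `--num_process` (default 8) and `--data_output_dir` (default `downloads`), so you can match the parallelism to your machine:

```bash
python extract_parallel.py --num_process 8 --data_output_dir downloads
```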

## Why
The NuScenes team, for some reason, keeps these links behind a convoluted authentication and token-expiration system. That makes downloading the data painful unless you want to keep a browser open for 5 days straight or feed 165 different links through a CurlWget extension one by one. These scripts automate the process by reverse engineering the authentication flow and capturing the bearer tokens used to generate the temporary download URLs. You can then download the files with wget very easily! Have fun <3
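
In short, the flow the scripts implement looks roughly like this (a condensed sketch of the `login` and `get_download_url` functions further down; the endpoint, client ID, and parameters are the ones hardcoded in the scripts):

```python
import json
import requests

# 1. Authenticate against AWS Cognito with your nuScenes credentials to get an ID token.
auth = requests.post(
    "https://cognito-idp.us-east-1.amazonaws.com/",
    headers={
        "content-type": "application/x-amz-json-1.1",
        "x-amz-target": "AWSCognitoIdentityProviderService.InitiateAuth",
    },
    data=json.dumps({
        "AuthFlow": "USER_PASSWORD_AUTH",
        "ClientId": "7fq5jvs5ffs1c50hd3toobb3b9",
        "AuthParameters": {"USERNAME": "<username>", "PASSWORD": "<password>"},
        "ClientMetadata": {},
    }),
)
token = auth.json()["AuthenticationResult"]["IdToken"]

# 2. Ask the archive API for a temporary, pre-signed download URL for one file.
response = requests.get(
    "https://o9k5xn5546.execute-api.us-east-1.amazonaws.com/v1/archives/v1.0/v1.0-mini.tgz",
    params={"region": "us", "project": "nuScenes"},
    headers={"authorization": "Bearer " + token},
)
print(response.json()["url"])  # feed this URL to wget
```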

48 changes: 38 additions & 10 deletions download_nuplan.py
@@ -5,7 +5,9 @@
from joblib import Parallel, delayed
from tqdm import tqdm

BASE_URL = "https://o9k5xn5546.execute-api.us-east-1.amazonaws.com/v1/archives/nuplan-v1.1/"
BASE_URL = (
"https://o9k5xn5546.execute-api.us-east-1.amazonaws.com/v1/archives/nuplan-v1.1/"
)


def login(username, password):
@@ -66,28 +68,52 @@ def main():

# nuPlan Test Set
## Lidars
links = [BASE_URL + f"sensor_blobs/test_set/nuplan-v1.1_test_lidar_{i}.zip" for i in range(12)]
links = [
BASE_URL + f"sensor_blobs/test_set/nuplan-v1.1_test_lidar_{i}.zip"
for i in range(12)
]
## Cameras
links += [BASE_URL + f"sensor_blobs/test_set/nuplan-v1.1_test_camera_{i}.zip" for i in range(12)]
links += [
BASE_URL + f"sensor_blobs/test_set/nuplan-v1.1_test_camera_{i}.zip"
for i in range(12)
]
# nuPlan Train Set
## Lidars
links += [BASE_URL + f"sensor_blobs/train_set/nuplan-v1.1_train_lidar_{i}.zip" for i in range(43)]
links += [
BASE_URL + f"sensor_blobs/train_set/nuplan-v1.1_train_lidar_{i}.zip"
for i in range(43)
]
## Cameras
links += [BASE_URL + f"sensor_blobs/train_set/nuplan-v1.1_train_camera_{i}.zip" for i in range(43)]
links += [
BASE_URL + f"sensor_blobs/train_set/nuplan-v1.1_train_camera_{i}.zip"
for i in range(43)
]
# nuPlan Val Set
## Lidars
links += [BASE_URL + f"sensor_blobs/val_set/nuplan-v1.1_val_lidar_{i}.zip" for i in range(12)]
links += [
BASE_URL + f"sensor_blobs/val_set/nuplan-v1.1_val_lidar_{i}.zip"
for i in range(12)
]
## Cameras
links += [BASE_URL + f"sensor_blobs/val_set/nuplan-v1.1_val_camera_{i}.zip" for i in range(12)]
links += [
BASE_URL + f"sensor_blobs/val_set/nuplan-v1.1_val_camera_{i}.zip"
for i in range(12)
]
# Maps
links += [BASE_URL + "nuplan-maps-v1.0.zip"]
# Mini Split
links += [BASE_URL + "nuplan-v1.1_mini.zip"]
# Mini Sensors
## Lidars
links += [BASE_URL + f"sensor_blobs/mini_set/nuplan-v1.1_mini_lidar_{i}.zip" for i in range(9)]
links += [
BASE_URL + f"sensor_blobs/mini_set/nuplan-v1.1_mini_lidar_{i}.zip"
for i in range(9)
]
## Cameras
links += [BASE_URL + f"sensor_blobs/mini_set/nuplan-v1.1_mini_camera_{i}.zip" for i in range(9)]
links += [
BASE_URL + f"sensor_blobs/mini_set/nuplan-v1.1_mini_camera_{i}.zip"
for i in range(9)
]
# Log DB Train Splits
links += [
BASE_URL + f"nuplan-v1.1_train_{city}.zip"
@@ -108,7 +134,9 @@ def main():
# Log DB Test Splits
links += [BASE_URL + "nuplan-v1.1_test.zip"]

download_links = Parallel(n_jobs=12)(delayed(get_download_url)(login_token, link) for link in tqdm(links))
download_links = Parallel(n_jobs=12)(
delayed(get_download_url)(login_token, link) for link in tqdm(links)
)

# write download links to file
with open("download_links.txt", "w") as f:
126 changes: 126 additions & 0 deletions download_nuscenes.py
@@ -0,0 +1,126 @@
import argparse
import json
import requests
import utils

from joblib import Parallel, delayed
from tqdm import tqdm


def login(username, password):
headers = {
"content-type": "application/x-amz-json-1.1",
"x-amz-target": "AWSCognitoIdentityProviderService.InitiateAuth",
}

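# Cognito USER_PASSWORD_AUTH payload, built as a raw JSON string; the hardcoded ClientId is presumably the public client ID of the nuScenes web app.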
data = (
'{"AuthFlow":"USER_PASSWORD_AUTH","ClientId":"7fq5jvs5ffs1c50hd3toobb3b9","AuthParameters":{"USERNAME":"'
+ username
+ '","PASSWORD":"'
+ password
+ '"},"ClientMetadata":{}}'
)

response = requests.post(
"https://cognito-idp.us-east-1.amazonaws.com/",
headers=headers,
data=data,
)

token = json.loads(response.content)["AuthenticationResult"]["IdToken"]

return token


def get_download_url(token, file_name):
# The URL prefix of the NuScenes dataset. Fetched from the request url.
BASE_URL = (
"https://o9k5xn5546.execute-api.us-east-1.amazonaws.com/v1/archives/v1.0/"
)

headers = {
"authorization": "Bearer " + token,
}

# From the same source as BASE_URL.
params = {
"region": "us",
"project": "nuScenes",
}

url = BASE_URL + file_name
response = requests.get(
url,
params=params,
headers=headers,
)

download_url = json.loads(response.content)["url"]

return download_url


def main():
parser = argparse.ArgumentParser(description="Download nuScenes dataset")
parser.add_argument("--username")
parser.add_argument("--password")
parser.add_argument(
"--data_output_dir", type=str, default=utils.get_default_data_output_dir()
)
args = parser.parse_args()

data_output_dir = args.data_output_dir
# requests session
with requests.Session() as s:
# login and get auth token
login_token = login(args.username, args.password)

# ====================
# NuScenes Trainval Set
# ====================
file_names = [f"v1.0-trainval{i:02d}_blobs.tgz" for i in range(1, 11)]

# Metadata
file_names += ["v1.0-trainval_meta.tgz"]

# ====================
# NuScenes Test Set
# ====================
file_names += ["v1.0-test_blobs.tgz"]

# Metadata
file_names += ["v1.0-test_meta.tgz"]

# ====================
# NuScenes Mini
# ====================
file_names += ["v1.0-mini.tgz"]

# ====================
# Shared data
# ====================
# CAN bus expansion
file_names += ["can_bus.zip"]

# Map expansion (v1.3)
file_names += ["nuScenes-map-expansion-v1.3.zip"]

download_links = Parallel(n_jobs=12)(
delayed(get_download_url)(login_token, file_name)
for file_name in tqdm(file_names)
)

# write download commands to file
with open("download_commands.txt", "w") as f:
# a command to create the output directory if it does not exist.
f.write(f"mkdir -p {data_output_dir}\n")
# a command to enter the output directory.
f.write(f"cd {data_output_dir}\n")
# commands to download each file.
for i in range(len(download_links)):
command = f'wget -O {file_names[i]} "{download_links[i]}"\n'
f.write(command)


if __name__ == "__main__":
main()
99 changes: 87 additions & 12 deletions extract_parallel.py
@@ -1,21 +1,96 @@
import argparse
import os
import subprocess
import utils
import zipfile
import tarfile

from typing import List
from joblib import Parallel, delayed
from tqdm import tqdm

target_folder = "/net/acadia3a/data/datasets/nuplan"
os.chdir(target_folder)
os.system("ls")

def get_all_files(folder_path) -> List[str]:
"""
Returns a list of all the files in the folder_path. Folders in the folder_path will be ignored.
The returned files will have a path like "folder_path/file".
"""
files = []
for file in os.listdir(folder_path):
file_path = os.path.join(folder_path, file)
if os.path.isfile(file_path):
files.append(file_path)

def extract_zip(zip_file):
os.system(f"tar -xf {zip_file}")
print(f"Extracted {zip_file}")
return files


if __name__ == "__main__":
zip_files = ["nuplan-v1.1_train_lidar_{}.zip".format(i) for i in range(1, 44)] # Assuming 44 zip files from 0 to 43
def extract_zip(zip_file_path, output_dir):
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
# Get the list of files to extract.
file_list = zip_ref.namelist()

# Create the output directory if it doesn't exist.
os.makedirs(output_dir, exist_ok=True)

# Extract files with progress using tqdm.
for file in tqdm(
file_list,
desc=f"Extracting file {zip_file_path} to {output_dir}",
unit=" files",
):
zip_ref.extract(file, output_dir)


def extract_tar(tar_file_path, output_dir):
with tarfile.open(tar_file_path, "r") as tar_ref:
# Get the list of members (files/directories) to extract.
member_list = tar_ref.getmembers()

# Create the output directory if it doesn't exist.
os.makedirs(output_dir, exist_ok=True)

# Extract members with progress using tqdm.
for member in tqdm(
member_list,
desc=f"Extracting file {tar_file_path} to {output_dir}",
unit=" members",
):
tar_ref.extract(member, output_dir)


def extract_compressed_files(file: str, output_dir: str):
"""
Extract a compressed file into output_dir. If the file does not end in ".tgz" or ".zip", report an error and do nothing.
"""
file_extension = os.path.splitext(file)[-1]

# Extract the zip files in parallel
num_processes = 8 # Adjust this number based on your system's capabilities
Parallel(n_jobs=num_processes)(delayed(extract_zip)(zip_file) for zip_file in zip_files)
if file_extension == ".zip":
extract_zip(file, output_dir)
elif file_extension == ".tgz":
extract_tar(file, output_dir)
else:
print(f"Unsupported file {file}!")
return

print(f"File {file} extracted.")


def main():
parser = argparse.ArgumentParser(description="Extract compressed files.")
# Adjust the num_process based on your system's capabilities.
parser.add_argument("--num_process", type=int, default=8)
parser.add_argument(
"--data_output_dir", type=str, default=utils.get_default_data_output_dir()
)
args = parser.parse_args()

data_output_dir = args.data_output_dir
num_process = args.num_process
files = get_all_files(data_output_dir)

Parallel(n_jobs=num_process)(
delayed(extract_compressed_files)(file, data_output_dir) for file in files
)


if __name__ == "__main__":
main()
2 changes: 2 additions & 0 deletions utils.py
@@ -0,0 +1,2 @@
def get_default_data_output_dir():
return "downloads"