diff --git a/image_segmentation/pytorch/.dockerignore b/image_segmentation/pytorch/.dockerignore
new file mode 100644
index 000000000..aa3d00cd0
--- /dev/null
+++ b/image_segmentation/pytorch/.dockerignore
@@ -0,0 +1 @@
+mlcube/workspace/*
\ No newline at end of file
diff --git a/image_segmentation/pytorch/.gitignore b/image_segmentation/pytorch/.gitignore
new file mode 100644
index 000000000..aa3d00cd0
--- /dev/null
+++ b/image_segmentation/pytorch/.gitignore
@@ -0,0 +1 @@
+mlcube/workspace/*
\ No newline at end of file
diff --git a/image_segmentation/pytorch/Dockerfile b/image_segmentation/pytorch/Dockerfile
index fbe42e6a8..7c41ff8f7 100644
--- a/image_segmentation/pytorch/Dockerfile
+++ b/image_segmentation/pytorch/Dockerfile
@@ -2,15 +2,17 @@ ARG FROM_IMAGE_NAME=pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime
 #ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.02-py3
 FROM ${FROM_IMAGE_NAME}
 
-ADD . /workspace/unet3d
-WORKDIR /workspace/unet3d
 
 RUN apt-get update && \
     apt-get upgrade -y && \
-    apt-get install -y git
-RUN apt-get install -y vim
+    apt-get install -y git vim wget unzip
 
 RUN pip install --upgrade pip
-RUN pip install --disable-pip-version-check -r requirements.txt
+COPY requirements.txt /
+RUN pip install --disable-pip-version-check -r /requirements.txt
+
+ADD . /workspace/unet3d
+RUN chmod +x /workspace/unet3d/*.sh
+WORKDIR /workspace/unet3d
 
 #RUN pip uninstall -y apex; pip uninstall -y apex; git clone --branch seryilmaz/fused_dropout_softmax https://github.com/seryilmaz/apex.git; cd apex; pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--xentropy" --global-option="--deprecated_fused_adam" --global-option="--deprecated_fused_lamb" --global-option="--fast_multihead_attn" .
diff --git a/image_segmentation/pytorch/download_data.sh b/image_segmentation/pytorch/download_data.sh
new file mode 100644
index 000000000..4395c5b47
--- /dev/null
+++ b/image_segmentation/pytorch/download_data.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+set -e
+
+: "${DATASET_PATH:=/}"
+
+while [ "$1" != "" ]; do
+    case $1 in
+        --data_dir=*)
+            DATASET_PATH="${1#*=}"
+            ;;
+    esac
+    shift
+done
+
+git clone https://github.com/neheller/kits19
+cd kits19
+cp -r data/* "$DATASET_PATH"
+rm -r data/
+ln -s "$DATASET_PATH" data
+pip install -r requirements.txt
+python -m starter_code.get_imaging
\ No newline at end of file
diff --git a/image_segmentation/pytorch/download_demo.sh b/image_segmentation/pytorch/download_demo.sh
new file mode 100644
index 000000000..8e34371ad
--- /dev/null
+++ b/image_segmentation/pytorch/download_demo.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+set -e
+
+: "${DATASET_PATH:=/}"
+
+while [ "$1" != "" ]; do
+    case $1 in
+        --data_dir=*)
+            DATASET_PATH="${1#*=}"
+            ;;
+    esac
+    shift
+done
+
+wget https://mlcube.mlcommons-storage.org/minibenchmarks/3d_unet.zip
+unzip -o 3d_unet.zip -d "$DATASET_PATH"
+rm 3d_unet.zip
\ No newline at end of file
diff --git a/image_segmentation/pytorch/mlcube/README.md b/image_segmentation/pytorch/mlcube/README.md
new file mode 100644
index 000000000..ab483036a
--- /dev/null
+++ b/image_segmentation/pytorch/mlcube/README.md
@@ -0,0 +1,65 @@
+# MLCube for 3D U-Net
+
+See the MLCube™ GitHub [repository](https://github.com/mlcommons/mlcube) and the MLCube™ [wiki](https://mlcommons.github.io/mlcube/).
+
+## Project setup
+
+You must have Docker installed.
+
+```bash
+# Create Python environment and install MLCube Docker runner
+virtualenv -p python3 ./env && source ./env/bin/activate && pip install mlcube-docker
+# Fetch the implementation from GitHub
+git clone https://github.com/mlcommons/training && cd ./training/image_segmentation/pytorch/mlcube
+```
+
+Inside the mlcube directory, run the following command to list the implemented tasks.
+
+```shell
+mlcube describe
+```
+
+### MLCube tasks
+
+Download the dataset.
+
+```shell
+mlcube run --task=download_data -Pdocker.build_strategy=always
+```
+
+Preprocess the dataset.
+
+```shell
+mlcube run --task=process_data -Pdocker.build_strategy=always
+```
+
+Train 3D U-Net.
+
+```shell
+mlcube run --task=train -Pdocker.build_strategy=always
+```
+
+### Execute the complete pipeline
+
+You can execute the complete pipeline with a single command.
+
+```shell
+mlcube run --task=download_data,process_data,train -Pdocker.build_strategy=always
+```
+
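+By default, task inputs and outputs such as `data_dir` and `log_dir` are
+created under `mlcube/workspace`, which is why that folder is git-ignored.
+If you keep your data elsewhere, the MLCube Docker runner accepts a
+`--workspace` flag that relocates these folders (the path below is only an example):
+
+```shell
+mlcube run --task=download_data,process_data,train --workspace=/data/unet3d_workspace -Pdocker.build_strategy=always
+```
+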
+## Run a quick demo
+
+You can run a quick demo that first downloads a tiny dataset and then executes a short training workload.
+
+```shell
+mlcube run --task=download_demo,demo -Pdocker.build_strategy=always
+```
diff --git a/image_segmentation/pytorch/mlcube/mlcube.yaml b/image_segmentation/pytorch/mlcube/mlcube.yaml
new file mode 100644
index 000000000..022d8bec5
--- /dev/null
+++ b/image_segmentation/pytorch/mlcube/mlcube.yaml
@@ -0,0 +1,53 @@
+name: 3d_unet
+description: Image Segmentation benchmark
+authors:
+  - { name: "MLCommons Best Practices Working Group" }
+
+platform:
+  accelerator_count: 1
+
+docker:
+  # Image name.
+  image: mlcommons/3d_unet:0.0.1
+  # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
+  build_context: "../"
+  # Docker file name within docker build context, default is `Dockerfile`.
+  build_file: "Dockerfile"
+  # GPU arguments
+  gpu_args: "--shm-size=1g --gpus=all"
+
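+# The runner launches each task's entrypoint inside the container and passes
+# every declared parameter as a --<name>=<path> command-line flag; these are
+# the flags the accompanying shell scripts parse in their while/case loops.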
+tasks:
+  download_data:
+    entrypoint: ./download_data.sh -a
+    parameters:
+      outputs:
+        data_dir: data/
+  process_data:
+    entrypoint: ./process_data.sh -a
+    parameters:
+      inputs:
+        data_dir: data/
+      outputs:
+        processed_data: processed_data/
+  train:
+    entrypoint: ./run_mlcube.sh -a
+    parameters:
+      inputs:
+        dataset_dir: processed_data/
+      outputs:
+        log_dir: logs/
+  download_demo:
+    entrypoint: ./download_demo.sh -a
+    parameters:
+      outputs:
+        data_dir: demo_data/
+  demo:
+    entrypoint: ./run_demo.sh -a
+    parameters:
+      inputs:
+        dataset_dir: demo_data/
+      outputs:
+        log_dir: demo_logs/
\ No newline at end of file
diff --git a/image_segmentation/pytorch/preprocess_dataset.py b/image_segmentation/pytorch/preprocess_dataset.py
index 25fb4f03e..e33069e73 100644
--- a/image_segmentation/pytorch/preprocess_dataset.py
+++ b/image_segmentation/pytorch/preprocess_dataset.py
@@ -58,7 +58,7 @@ def __init__(self, args):
     def preprocess_dataset(self):
         os.makedirs(self.results_dir, exist_ok=True)
         print(f"Preprocessing {self.data_dir}")
-        for case in sorted([f for f in os.listdir(self.data_dir) if "case" in f]):
+        for case in tqdm(sorted([f for f in os.listdir(self.data_dir) if "case" in f])):
             case_id = int(case.split("_")[1])
             if case_id in EXCLUDED_CASES or case_id >= MAX_ID:
                 print("Case {}. Skipped.".format(case_id))
diff --git a/image_segmentation/pytorch/process_data.sh b/image_segmentation/pytorch/process_data.sh
new file mode 100644
index 000000000..4f081a055
--- /dev/null
+++ b/image_segmentation/pytorch/process_data.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+set -e
+
+: "${DATA_DIR:=/}"
+: "${PROCESSED_DATA:=/}"
+
+while [ $# -gt 0 ]; do
+    case "$1" in
+        --data_dir=*)
+            DATA_DIR="${1#*=}"
+            ;;
+        --processed_data=*)
+            PROCESSED_DATA="${1#*=}"
+            ;;
+        *) ;;
+    esac
+    shift
+done
+
+python preprocess_dataset.py --data_dir "$DATA_DIR" --results_dir "$PROCESSED_DATA"
\ No newline at end of file
diff --git a/image_segmentation/pytorch/run_demo.sh b/image_segmentation/pytorch/run_demo.sh
new file mode 100644
index 000000000..b6f9b1a0f
--- /dev/null
+++ b/image_segmentation/pytorch/run_demo.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+set -e
+
+# runs a short training demo and reports elapsed time
+# to use the script:
+#   run_demo.sh --dataset_dir=<path> --log_dir=<path>
+
+: "${SEED:=0}"
+: "${DATASET_DIR:=/data}"
+: "${LOG_DIR:=/results}"
+
+while [ $# -gt 0 ]; do
+    case "$1" in
+        --dataset_dir=*)
+            DATASET_DIR="${1#*=}"
+            ;;
+        --log_dir=*)
+            LOG_DIR="${1#*=}"
+            ;;
+        *) ;;
+    esac
+    shift
+done
+
+ln -s "$LOG_DIR" /results
+
+MAX_EPOCHS=50
+QUALITY_THRESHOLD="0.908"
+START_EVAL_AT=50
+EVALUATE_EVERY=1
+LEARNING_RATE="0.8"
+LR_WARMUP_EPOCHS=1
+BATCH_SIZE=2
+GRADIENT_ACCUMULATION_STEPS=1
+
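+# Demo schedule: 50 epochs with a single evaluation at the end. This is a
+# smoke test; it is not expected to reach the 0.908 quality target used by
+# the full benchmark run (see run_mlcube.sh).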
+
+if [ -d "${DATASET_DIR}" ]
+then
+    # start timing
+    start=$(date +%s)
+    start_fmt=$(date +%Y-%m-%d\ %r)
+    echo "STARTING TIMING RUN AT $start_fmt"
+
+    # CLEAR YOUR CACHE HERE
+    python -c "
+from mlperf_logging.mllog import constants
+from runtime.logging import mllog_event
+mllog_event(key=constants.CACHE_CLEAR, value=True)"
+
+    python main.py --data_dir ${DATASET_DIR} \
+        --epochs ${MAX_EPOCHS} \
+        --evaluate_every ${EVALUATE_EVERY} \
+        --start_eval_at ${START_EVAL_AT} \
+        --quality_threshold ${QUALITY_THRESHOLD} \
+        --batch_size ${BATCH_SIZE} \
+        --optimizer sgd \
+        --ga_steps ${GRADIENT_ACCUMULATION_STEPS} \
+        --learning_rate ${LEARNING_RATE} \
+        --seed ${SEED} \
+        --lr_warmup_epochs ${LR_WARMUP_EPOCHS}
+
+    # end timing
+    end=$(date +%s)
+    end_fmt=$(date +%Y-%m-%d\ %r)
+    echo "ENDING TIMING RUN AT $end_fmt"
+
+
+    # report result
+    result=$(( $end - $start ))
+    result_name="image_segmentation"
+
+
+    echo "RESULT,$result_name,$SEED,$result,$USER,$start_fmt"
+else
+    echo "Directory ${DATASET_DIR} does not exist"
+fi
\ No newline at end of file
diff --git a/image_segmentation/pytorch/run_mlcube.sh b/image_segmentation/pytorch/run_mlcube.sh
new file mode 100644
index 000000000..96a2a0eb9
--- /dev/null
+++ b/image_segmentation/pytorch/run_mlcube.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+set -e
+
+# runs benchmark and reports time to convergence
+# to use the script:
+#   run_mlcube.sh --dataset_dir=<path> --log_dir=<path>
+
+: "${SEED:=0}"
+: "${DATASET_DIR:=/data}"
+: "${LOG_DIR:=/results}"
+
+while [ $# -gt 0 ]; do
+    case "$1" in
+        --dataset_dir=*)
+            DATASET_DIR="${1#*=}"
+            ;;
+        --log_dir=*)
+            LOG_DIR="${1#*=}"
+            ;;
+        *) ;;
+    esac
+    shift
+done
+
+ln -s "$LOG_DIR" /results
+
+MAX_EPOCHS=4000
+QUALITY_THRESHOLD="0.908"
+START_EVAL_AT=1000
+EVALUATE_EVERY=20
+LEARNING_RATE="0.8"
+LR_WARMUP_EPOCHS=200
+BATCH_SIZE=2
+GRADIENT_ACCUMULATION_STEPS=1
+
+
+if [ -d "${DATASET_DIR}" ]
+then
+    # start timing
+    start=$(date +%s)
+    start_fmt=$(date +%Y-%m-%d\ %r)
+    echo "STARTING TIMING RUN AT $start_fmt"
+
+    # CLEAR YOUR CACHE HERE
+    python -c "
+from mlperf_logging.mllog import constants
+from runtime.logging import mllog_event
+mllog_event(key=constants.CACHE_CLEAR, value=True)"
+
+    python main.py --data_dir ${DATASET_DIR} \
+        --epochs ${MAX_EPOCHS} \
+        --evaluate_every ${EVALUATE_EVERY} \
+        --start_eval_at ${START_EVAL_AT} \
+        --quality_threshold ${QUALITY_THRESHOLD} \
+        --batch_size ${BATCH_SIZE} \
+        --optimizer sgd \
+        --ga_steps ${GRADIENT_ACCUMULATION_STEPS} \
+        --learning_rate ${LEARNING_RATE} \
+        --seed ${SEED} \
+        --lr_warmup_epochs ${LR_WARMUP_EPOCHS}
+
+    # end timing
+    end=$(date +%s)
+    end_fmt=$(date +%Y-%m-%d\ %r)
+    echo "ENDING TIMING RUN AT $end_fmt"
+
+
+    # report result
+    result=$(( $end - $start ))
+    result_name="image_segmentation"
+
+
+    echo "RESULT,$result_name,$SEED,$result,$USER,$start_fmt"
+else
+    echo "Directory ${DATASET_DIR} does not exist"
+fi
\ No newline at end of file