diff --git a/CMakeLists.txt b/CMakeLists.txt index e7504d702..81845dd7b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -158,10 +158,6 @@ endif() # option for nccl option(FF_USE_NCCL "Run FlexFlow with NCCL" OFF) -if (FF_GPU_BACKEND STREQUAL "hip_rocm" AND FF_USE_NCCL STREQUAL "ON") - message(FATAL_ERROR "NCCL: ON for FF_GPU_BACKEND: hip_rocm. hip_rocm backend must have NCCL disabled.") -endif() - # option for avx2 option(FF_USE_AVX2 "Run FlexFlow with AVX2" OFF) @@ -224,7 +220,9 @@ endif() # NCCL if(FF_USE_NCCL) - include(nccl) + if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda") + include(nccl) + endif() list(APPEND FF_CC_FLAGS -DFF_USE_NCCL) list(APPEND FF_NVCC_FLAGS @@ -369,11 +367,13 @@ elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm") elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") find_package(hipblas REQUIRED) find_package(miopen REQUIRED) + if(FF_USE_NCCL) + find_package(rccl REQUIRED) + endif() # find_package(rocrand REQUIRED) find_library(HIP_RAND_LIBRARY hiprand REQUIRED) add_compile_definitions(FF_USE_HIP_ROCM) - # The hip cmake config module defines three targets, # hip::amdhip64, hip::host, and hip::device. # @@ -387,12 +387,15 @@ elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm") # Docs (outdated): # https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html target_link_libraries(flexflow hip::device roc::hipblas MIOpen ${HIP_RAND_LIBRARY}) + if(FF_USE_NCCL) + target_link_libraries(flexflow rccl) + endif() endif() else() message(FATAL_ERROR "Unsupported FF_GPU_BACKEND for cmake: ${FF_GPU_BACKEND}") endif() -if(FF_USE_NCCL) +if(FF_USE_NCCL AND (FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda")) add_dependencies(flexflow ${NCCL_NAME}) endif() diff --git a/cmake/json.cmake b/cmake/json.cmake index 63ac50b20..3cf57a786 100644 --- a/cmake/json.cmake +++ b/cmake/json.cmake @@ -1,4 +1 @@ -include(FetchContent) - -FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz) -FetchContent_MakeAvailable(json) \ No newline at end of file +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/json) diff --git a/config/config.inc b/config/config.inc index ebc6b9cb4..6497dae40 100644 --- a/config/config.inc +++ b/config/config.inc @@ -84,6 +84,8 @@ if [ "$FF_LEGION_NETWORKS" = "gasnet" ]; then elif [ "$FF_GASNET_CONDUIT" = "ucx" ]; then SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=ucx" SET_LEGION_NETWORKS+=" -DFF_UCX_URL=$FF_UCX_URL" + elif [ "$FF_GASNET_CONDUIT" = "ofi" ]; then + SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=ofi" fi elif [ "$FF_LEGION_NETWORKS" = "ucx" ]; then SET_LEGION_NETWORKS+=" -DFF_LEGION_NETWORKS=ucx" @@ -182,7 +184,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then chmod +x "$(pwd)/nvidia_hipcc" SET_CXX="-DCMAKE_CXX_COMPILER=$(pwd)/nvidia_hipcc -DCMAKE_CXX_LINKER=$(pwd)/nvidia_hipcc" else - SET_CXX="-DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_CXX_LINKER=/opt/rocm/bin/hipcc" + SET_CXX="-DCMAKE_CXX_COMPILER=$ROCM_PATH/bin/hipcc -DCMAKE_CXX_LINKER=$ROCM_PATH/bin/hipcc -DHIP_PATH=$ROCM_PATH/hip -DCMAKE_CXX_FLAGS='-I${MPICH_DIR}/include' -DCMAKE_EXE_LINKER_FLAGS='-L${MPICH_DIR}/lib -lmpi' -DCMAKE_SHARED_LINKER_FLAGS='-L${MPICH_DIR}/lib -lmpi'" fi fi fi diff --git a/config/config.linux b/config/config.linux index 04908d81b..d3729aea4 100755 --- a/config/config.linux +++ b/config/config.linux @@ -38,7 +38,7 @@ FF_USE_PYTHON=${FF_USE_PYTHON:-ON} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS:-} # select GASNET conduit 
-FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} +FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ofi} # set UCX URL FF_UCX_URL=${FF_UCX_URL:-""} @@ -70,11 +70,9 @@ FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid." exit 1 -elif [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" ]]; then +elif [["$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" || "$FF_GPU_BACKEND" == "hip_rocm"]]; then # enable NCCL FF_USE_NCCL=${FF_USE_NCCL:-ON} -else - FF_USE_NCCL=OFF fi function get_build_configs() { diff --git a/examples/python/pytorch/mt5/mt5_ff.py b/examples/python/pytorch/mt5/mt5_ff.py index 41b84a269..5dff7415d 100644 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ b/examples/python/pytorch/mt5/mt5_ff.py @@ -3,16 +3,17 @@ import sys import numpy as np +import torch from flexflow.core import * from flexflow.torch.model import PyTorchModel -from transformers import MT5ForConditionalGeneration, T5Tokenizer - +#from transformers import MT5ForConditionalGeneration, T5Tokenizer +from transformers import BertForMaskedLM, BertTokenizer sys.path.append("./examples/python/pytorch/mt5") from mt5_torch import DataPreparer, get_dataloaders, set_seed BASE_DIR = "examples/python/pytorch/mt5" DATA_DIR = os.path.join(BASE_DIR, "data") -NUMPY_DIR = os.path.join(DATA_DIR, "numpy") +NUMPY_DIR = os.path.join(DATA_DIR, "numpy_candle") def data_to_numpy() -> None: @@ -28,7 +29,8 @@ def data_to_numpy() -> None: """ model_params = { "SEED": 42, - "MODEL": "google/mt5-small", + #"MODEL": "google/mt5-small", + "MODEL": "bert-base-uncased", "TRAIN_BATCH_SIZE": None, # use the full dataset as one batch "EVAL_BATCH_SIZE": None, # use the full dataset as one batch "TRAIN_EPOCHS": 1, # unused @@ -36,7 +38,8 @@ def data_to_numpy() -> None: "MAX_TARGET_TEXT_LENGTH": 48, } set_seed(model_params) - tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"]) + #tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"]) + tokenizer = BertTokenizer.from_pretrained(model_params["MODEL"]) print("Getting dataloaders...") train_loader, eval_loader = get_dataloaders(tokenizer, model_params) assert len(train_loader) == 1 @@ -61,8 +64,8 @@ def preprocess_train() -> None: y_shape = y.shape assert len(y.shape) == 2, \ "`y` should have shape (num examples, sequence length)" - y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) - lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) + y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) + lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) y_ids[:, :] = y[:, :-1] lm_labels[:, :] = y[:, 1:] @@ -81,36 +84,54 @@ def preprocess_train() -> None: def top_level_task(): ffconfig = FFConfig() ffmodel = FFModel(ffconfig) - model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small") - + #model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small") + model = BertForMaskedLM.from_pretrained("bert-base-uncased") + #model = BertModel.from_pretrained("bert-base-uncased") # Load train data as numpy arrays print("Loading data...") - ids = np.load(os.path.join(NUMPY_DIR, "train_source_ids.npy")) - mask = np.load(os.path.join(NUMPY_DIR, "train_source_mask.npy")) - y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) - lm_labels = np.load(os.path.join(NUMPY_DIR, "train_lm_labels.npy")) + ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")).astype('int32') + ids = np.pad(ids, ((0,0), 
(0,17)), 'constant') + #ids = np.random.randint(0, 5, (1000, 512)) + #print('ids_shape', ids.shape) + #print('ids', ids) + mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")).astype('int32') + mask = np.pad(mask, ((0,0), (0,17)), 'constant') + #mask = np.random.randint(0, 2, (1000, 512)) + #y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) + lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")).astype('int32') + lm_labels = np.pad(lm_labels, ((0,0), (0,17)), 'constant') + #lm_labels = np.random.randint(-1, 5, (1000, 512)) + position_id = torch.arange(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() + token_type_ids = torch.zeros(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() + batch_size = ffconfig.batch_size input_ids_shape = (batch_size, ids.shape[1]) attention_mask_shape = (batch_size, mask.shape[1]) - decoder_input_ids_shape = (batch_size, y_ids.shape[1]) + #decoder_input_ids_shape = (batch_size, y_ids.shape[1]) input_tensors = [ - ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64), # input_ids - ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT64), # attention_mask - ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids + ffmodel.create_tensor(input_ids_shape, DataType.DT_INT32), # input_ids + ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT32), # attention_mask + #ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids ] encoder_seq_length = ids.shape[1] - decoder_seq_length = y_ids.shape[1] - seq_length = (encoder_seq_length, decoder_seq_length) - input_names = ["input_ids", "attention_mask", "decoder_input_ids"] + #decoder_seq_length = y_ids.shape[1] + #seq_length = (encoder_seq_length, decoder_seq_length) + seq_length = encoder_seq_length + #input_names = ["input_ids", "attention_mask", "decoder_input_ids"] + input_names = ["input_ids", "attention_mask"] print("Tracing the model...") + print(batch_size) hf_model = PyTorchModel( model, is_hf_model=True, input_names=input_names, batch_size=batch_size, seq_length=seq_length, ) output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True) - ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) + #from flexflow.torch.model import file_to_ff + #file_to_ff("mt5.ff", ffmodel, input_tensors) + ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, beta1=0.9, beta2=0.98, weight_decay=0.0, epsilon=2e-8) + # ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) print("Compiling the model...") ffmodel.compile( @@ -121,13 +142,21 @@ def top_level_task(): MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, ], ) + + # load weights here + ffmodel.load_bert_pretrained(checkpoint=model) print("Creating data loaders...") + print('id_dtype', ids.dtype) + print('mask_dtype', mask.dtype) + print('labels_dtype', lm_labels.dtype) input_ids_dl = ffmodel.create_data_loader(input_tensors[0], ids) attention_mask_dl = ffmodel.create_data_loader(input_tensors[1], mask) - decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids) + #decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids) # NOTE: We cast down the label tensor data to 32-bit to accommodate the # label tensor's required dtype + token_type_ids_dl = ffmodel.create_data_loader(input_tensors[2], token_type_ids) + position_id_dl = ffmodel.create_data_loader(input_tensors[3], position_id) labels_dl = ffmodel.create_data_loader( ffmodel.label_tensor, lm_labels.astype("int32") ) @@ -138,31 +167,32 @@ def top_level_task(): 
print("Training...") epochs = ffconfig.epochs ffmodel.fit( - x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl], + #x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl], + x=[input_ids_dl, attention_mask_dl, position_id_dl, token_type_ids_dl], y=labels_dl, batch_size=batch_size, epochs=epochs, ) if __name__ == "__main__": - # Generate the .tsv files if needed - if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \ - not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")): - DataPreparer.data_to_tsv() - # Convert the .tsv files to .npy if needed - if not os.path.exists(NUMPY_DIR): - os.mkdir(NUMPY_DIR) - prefixes = ["train_", "eval_"] - suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"] - npy_filenames = [ - pre + suf for pre, suf in itertools.product(prefixes, suffixes) - ] - if any( - not os.path.exists(os.path.join(NUMPY_DIR, filename)) - for filename in npy_filenames - ): - data_to_numpy() - # Preprocess the training data if needed - if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \ - not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")): - preprocess_train() + ## Generate the .tsv files if needed + #if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \ + # not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")): + # DataPreparer.data_to_tsv() + ## Convert the .tsv files to .npy if needed + #if not os.path.exists(NUMPY_DIR): + # os.mkdir(NUMPY_DIR) + #prefixes = ["train_", "eval_"] + #suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"] + #npy_filenames = [ + # pre + suf for pre, suf in itertools.product(prefixes, suffixes) + #] + #if any( + # not os.path.exists(os.path.join(NUMPY_DIR, filename)) + # for filename in npy_filenames + #): + # data_to_numpy() + ## Preprocess the training data if needed + #if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \ + # not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")): + # preprocess_train() top_level_task() diff --git a/examples/python/pytorch/mt5/mt5_torch.py b/examples/python/pytorch/mt5/mt5_torch.py index 78886eed6..4d741c44a 100644 --- a/examples/python/pytorch/mt5/mt5_torch.py +++ b/examples/python/pytorch/mt5/mt5_torch.py @@ -7,7 +7,7 @@ import os import numpy as np -import pandas as pd +#import pandas as pd import torch from torch.utils.data import DataLoader, Dataset from transformers import MT5ForConditionalGeneration, T5Tokenizer @@ -311,5 +311,5 @@ def TorchMT5Trainer( "MAX_TARGET_TEXT_LENGTH": 48, "LEARNING_RATE": 1e-4, } - device = torch.device(0) + device = torch.device('cpu') TorchMT5Trainer(model_params, device) diff --git a/gdb/pretty_print.py b/gdb/pretty_print.py index 4cccc9b76..e6fbe298c 100644 --- a/gdb/pretty_print.py +++ b/gdb/pretty_print.py @@ -61,7 +61,11 @@ def to_string(self): size = dim['size'] degree = dim['degree'] parallel_idx = dim['parallel_idx'] - toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]') + if dim['is_replica_dim']: + is_replica = 'r=t' + else: + is_replica = 'r=f' + toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx} {is_replica}]') return f'TensorShape<{" ".join(toks)}>' class ParallelTensorBasePrinter: @@ -77,9 +81,31 @@ def to_string(self): size = dim['size'] degree = dim['degree'] parallel_idx = dim['parallel_idx'] - toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]') + tok = f'{i}=[s={size} d={degree} pi={parallel_idx} ' + if dim['is_replica_dim']: + tok += 'r=t' + else: + tok += 'r=f' + tok += ']' + toks.append(tok) return f'ParallelTensorBase<{" 
".join(toks)}>' +class ParallelDimPrinter: + def __init__(self, val): + self.val = val + + def to_string(self): + size = self.val['size'] + degree = self.val['degree'] + parallel_idx = self.val['parallel_idx'] + tok = f's={size} d={degree} pi={parallel_idx} ' + if dim['is_replica_dim']: + tok += 'r=t' + else: + tok += 'r=f' + return f'ParallelDim<{tok}>' + + def build_pretty_printer(): pp = gdb.printing.RegexpCollectionPrettyPrinter( "flexflow") @@ -89,6 +115,7 @@ def build_pretty_printer(): pp.add_printer('Domain', '^Legion::Domain$', DomainPrinter) pp.add_printer('ParallelTensorShape', '^FlexFlow::ParallelTensorShape$', TensorShapePrinter) pp.add_printer('ParallelTensorBase', '^FlexFlow::ParallelTensorBase$', ParallelTensorBasePrinter) + pp.add_printer('ParallelDim', '^FlexFlow::ParallelDim$', ParallelDimPrinter) return pp gdb.printing.register_pretty_printer( diff --git a/include/flexflow/config.h b/include/flexflow/config.h index d82b1377c..b6a27a4f2 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -28,8 +28,10 @@ #error "Unknown device" #endif #include "tl/optional.hpp" -#ifdef FF_USE_NCCL +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include +#else +#include #endif namespace FlexFlow { @@ -122,6 +124,7 @@ class FFConfig { size_t workSpaceSize; Legion::Context lg_ctx; Legion::Runtime *lg_hlr; + Legion::IndexSpaceT<1> all_gpu_task_is; Legion::FieldSpace field_space; bool syntheticInput, profiling, perform_fusion; size_t simulator_work_space_size; @@ -135,6 +138,8 @@ class FFConfig { bool enable_parameter_parallel; bool enable_attribute_parallel; bool enable_inplace_optimizations; + int data_parallelism_degree; + int tensor_parallelism_degree; // Control Tensor Op Math Conversion bool allow_tensor_op_math_conversion; std::string dataset_path; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 5658e2923..060983b02 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -157,6 +157,7 @@ enum OperatorType { OP_REPLICATE, OP_REDUCTION, OP_PIPELINE, + OP_ALLREDUCE, OP_FUSED_PARALLEL, OP_INVALID, }; @@ -189,6 +190,7 @@ enum PMParameter { PM_COMBINE_DEGREE, // Combine PM_REDUCTION_DIM, // Reduction PM_REDUCTION_DEGREE, // Reduction + PM_ALLREDUCE_DIM, // AllReduce PM_SOFTMAX_DIM, // Softmax PM_NUM_HEADS, // MultiHeadAttention PM_INVALID, diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 16ce3ac20..2ddc8549f 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -95,6 +95,8 @@ void flexflow_model_compute_metrics(flexflow_model_t handle); void flexflow_model_update(flexflow_model_t handle); +void flexflow_model_unified_update(flexflow_model_t handle); + void flexflow_model_compile(flexflow_model_t handle, enum LossType loss_type, int *metrics, @@ -278,6 +280,7 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle, flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle, const flexflow_tensor_t input, int dim, + bool last_layer, char const *name); flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle, diff --git a/include/flexflow/graph.h b/include/flexflow/graph.h index 2e0cf1ca4..2c92eeeb3 100644 --- a/include/flexflow/graph.h +++ b/include/flexflow/graph.h @@ -91,7 +91,7 @@ struct NodeCompare { struct GraphOptimalViewSerialized { #ifdef LEGION_MAX_RETURN_SIZE - static const size_t buffer_size = LEGION_MAX_RETURN_SIZE - 8; + static const size_t buffer_size = 4 * LEGION_MAX_RETURN_SIZE - 8; #else static const size_t 
buffer_size = 1024 * 1024 - 8; #endif @@ -332,6 +332,8 @@ class Graph { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static GraphOptimalViewSerialized + graph_optimize_wrapper(FFModel * model); Node find_bottleneck_node(Node const &sink_node, Node const &source_node) const; void print_strategy_computation_graph( diff --git a/include/flexflow/initializer.h b/include/flexflow/initializer.h index 062530a65..3c44d1184 100644 --- a/include/flexflow/initializer.h +++ b/include/flexflow/initializer.h @@ -46,7 +46,7 @@ class GlorotUniform : public Initializer { class Op; struct ZeroInitMeta { - static int const MAX_NUM_REGIONS = 64; + static int const MAX_NUM_REGIONS = 128; int num_regions; Op *op_ptr; DataType data_types[MAX_NUM_REGIONS]; diff --git a/include/flexflow/machine_view.h b/include/flexflow/machine_view.h index 8843dc4d6..b843555e0 100644 --- a/include/flexflow/machine_view.h +++ b/include/flexflow/machine_view.h @@ -3,8 +3,10 @@ #include "legion.h" #include -#ifdef FF_USE_NCCL +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include +#else +#include #endif #include "flexflow/config.h" diff --git a/include/flexflow/model.h b/include/flexflow/model.h index cb1b26d62..fe73e6a0e 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -151,6 +151,7 @@ enum TaskIDs { // Optimizer with NCCL SGD_UPD_NCCL_TASK_ID, ADAM_UPD_NCCL_TASK_ID, + ADAM_UNIFY_UPD_NCCL_TASK_ID, // Initializer GLOROT_INIT_TASK_ID, ZERO_INIT_TASK_ID, @@ -190,6 +191,10 @@ enum TaskIDs { PIPELINE_INIT_TASK_ID, PIPELINE_FWD_TASK_ID, PIPELINE_BWD_TASK_ID, + ALLREDUCE_INIT_TASK_ID, + ALLREDUCE_INF_TASK_ID, + ALLREDUCE_FWD_TASK_ID, + ALLREDUCE_BWD_TASK_ID, FUSED_PARALLELOP_INIT_TASK_ID, FUSED_PARALLELOP_FWD_TASK_ID, FUSED_PARALLELOP_BWD_TASK_ID, @@ -273,6 +278,7 @@ class Split; class TopK; class Transpose; class Combine; +class AllReduce; class Repartition; class Reduction; class Replicate; @@ -473,6 +479,7 @@ class FFModel { std::vector const &axes, bool elementwise_affine, float eps, + DataType data_type = DT_NONE, char const *name = NULL); // Add a batch_norm layer Tensor @@ -521,7 +528,10 @@ class FFModel { // Add a flat layer Tensor flat(const Tensor input, char const *name = NULL); // Add a softmax layer - Tensor softmax(const Tensor input, int dim = -1, char const *name = NULL); + Tensor softmax(const Tensor input, + int dim = -1, + bool last_layer = false, + char const *name = NULL); // Create input tensors and constants Tensor transpose(const Tensor input, std::vector const &perm, @@ -773,6 +783,7 @@ class FFModel { void get_metrics(); void backward(int seq_length = -1); void update(); + void unified_update(); bool apply_fusion(std::vector const &operators, std::vector &new_operators); Op *get_final_operator() const; @@ -824,6 +835,8 @@ class FFModel { Legion::IndexSpace get_task_is(Legion::Domain const &domain) const; Legion::IndexSpace get_task_is(ParallelConfig const &pc) const; Legion::IndexSpace get_task_is(MachineView const &view) const; + bool is_transformer_block(int layer_idx) const; + bool is_mlp_block(int layer_idx) const; void create_operators_from_layers(); Op *create_operator_from_layer(Layer *layer, std::vector const &inputs); @@ -850,6 +863,7 @@ class FFModel { int metrics_input; ParallelTensor parallel_label_tensor; Tensor label_tensor; + int num_inputs = 0; std::vector layers; std::vector operators; @@ -919,6 +933,8 @@ class FFModel { Replicate *>, std::unordered_map, Reduction *>, + std::unordered_map, + AllReduce *>, std::unordered_map, Combine *>, 
std::unordered_map, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 24c84a85e..84653ac9c 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -7,6 +7,7 @@ #include "flexflow/ops/batch_matmul_params.h" #include "flexflow/ops/cast_params.h" #include "flexflow/ops/concat_params.h" +#include "flexflow/parallel_ops/allreduce_params.h" #include "flexflow/ops/conv_2d_params.h" #include "flexflow/ops/dropout_params.h" #include "flexflow/ops/element_binary_params.h" @@ -62,6 +63,7 @@ using OperatorParameters = mp::variant; tl::optional get_op_parameters(Op const *op); diff --git a/include/flexflow/ops/dropout.h b/include/flexflow/ops/dropout.h index 37304bdad..b8033c98b 100644 --- a/include/flexflow/ops/dropout.h +++ b/include/flexflow/ops/dropout.h @@ -5,6 +5,13 @@ #include "flexflow/node.h" #include "flexflow/operator.h" #include "flexflow/ops/dropout_params.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include +#include +#elif defined(FF_USE_HIP_ROCM) +#include +#include +#endif namespace FlexFlow { diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index cfacec50f..677ff23ce 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -53,11 +53,17 @@ class ElementBinary : public Op { bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); Params get_params() const; public: bool inplace_a, has_same_operands; bool broadcast_input1, broadcast_input2; + int batch_size; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/element_binary_params.h b/include/flexflow/ops/element_binary_params.h index 5aa20e25a..c70e1b597 100644 --- a/include/flexflow/ops/element_binary_params.h +++ b/include/flexflow/ops/element_binary_params.h @@ -8,6 +8,7 @@ namespace FlexFlow { struct ElementBinaryParams { OperatorType type; + bool inplace_a; bool is_valid( std::pair const &) const; diff --git a/include/flexflow/ops/kernels/dropout_kernels.h b/include/flexflow/ops/kernels/dropout_kernels.h index 421974fba..b2201dd34 100644 --- a/include/flexflow/ops/kernels/dropout_kernels.h +++ b/include/flexflow/ops/kernels/dropout_kernels.h @@ -5,6 +5,7 @@ #include "flexflow/fftype.h" #include "flexflow/op_meta.h" #include "flexflow/ops/dropout.h" +#include "flexflow/accessor.h" namespace FlexFlow { @@ -17,33 +18,40 @@ class DropoutMeta : public OpMeta { ~DropoutMeta(void); Realm::RegionInstance reserveInst; #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + curandState *state; cudnnTensorDescriptor_t inputTensor, outputTensor; cudnnDropoutDescriptor_t dropoutDesc; #else miopenTensorDescriptor_t inputTensor, outputTensor; miopenDropoutDescriptor_t dropoutDesc; + hiprandState *state; #endif void *reserveSpace, *dropoutStates; size_t reserveSpaceSize, dropoutStateSize; + size_t num_elements; + long long seed; + float rate; }; namespace Kernels { namespace Dropout { void forward_kernel_wrapper(DropoutMeta *m, - float const *input_ptr, - float *output_ptr); + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); void backward_kernel_wrapper(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); 
namespace Internal { void forward_kernel(DropoutMeta *m, float const *input_ptr, float *output_ptr, + size_t num_elements, ffStream_t stream); void backward_kernel(DropoutMeta *m, float const *output_grad_ptr, float *input_grad_ptr, + size_t num_elements, ffStream_t stream); } // namespace Internal } // namespace Dropout diff --git a/include/flexflow/ops/kernels/element_binary_kernels.h b/include/flexflow/ops/kernels/element_binary_kernels.h index 529859195..50c7f2b80 100644 --- a/include/flexflow/ops/kernels/element_binary_kernels.h +++ b/include/flexflow/ops/kernels/element_binary_kernels.h @@ -22,6 +22,8 @@ class ElementBinaryMeta : public OpMeta { OperatorType op_type; bool inplace_a, has_same_operands; bool broadcast_input1, broadcast_input2; + int batch_size; + size_t replicate_size; char op_name[MAX_OPNAME]; }; diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 81b34d855..9aec9f57c 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -20,6 +20,7 @@ class SoftmaxMeta : public OpMeta { #endif bool profiling; int dim; + bool last_layer; char op_name[MAX_OPNAME]; }; @@ -33,6 +34,7 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements); namespace Internal { @@ -40,8 +42,10 @@ void forward_kernel(SoftmaxMeta const *m, float const *input_ptr, float *output_ptr, ffStream_t stream); -void backward_kernel(float *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements, ffStream_t stream); } // namespace Internal diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 8273b9ab5..de5ed48df 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -63,15 +63,14 @@ class LayerNorm : public Op { static void forward_kernel(LayerNormMeta const *m, T const *input_ptr, T *output_ptr, - T *gamma_ptr, - T *beta_ptr, + T const *gamma_ptr, + T const *beta_ptr, ffStream_t stream); - template static void forward_kernel_wrapper(LayerNormMeta const *m, - T const *input_ptr, - T *output_ptr, - T *gamma_ptr, - T *beta_ptr); + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta); template static void backward_kernel(LayerNormMeta const *m, T const *output_grad_ptr, @@ -105,7 +104,7 @@ class LayerNormMeta : public OpMeta { bool elementwise_affine; int64_t effective_batch_size, effective_num_elements; float eps; - float *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; + void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; char op_name[MAX_OPNAME]; }; diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 25a20315b..2616294a3 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -15,6 +15,7 @@ class Softmax : public Op { Softmax(FFModel &model, const ParallelTensor logit, int dim, + bool _last_layer, char const *name); Softmax(FFModel &model, Params const ¶ms, @@ -64,6 +65,7 @@ class Softmax : public Op { public: int dim; + bool last_layer; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/softmax_params.h b/include/flexflow/ops/softmax_params.h index d805d9966..545e3a5cb 100644 --- 
a/include/flexflow/ops/softmax_params.h +++ b/include/flexflow/ops/softmax_params.h @@ -7,6 +7,7 @@ namespace FlexFlow { struct SoftmaxParams { int dim; + bool last_layer; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(SoftmaxParams const &, SoftmaxParams const &); diff --git a/include/flexflow/ops/split.h b/include/flexflow/ops/split.h index 633268ffb..6c0736a76 100644 --- a/include/flexflow/ops/split.h +++ b/include/flexflow/ops/split.h @@ -50,6 +50,8 @@ class Split : public Op { Params get_params() const; + tl::optional as_dot() const override; + public: int legion_axis; std::vector splits; diff --git a/include/flexflow/optimizer.h b/include/flexflow/optimizer.h index bab7e6e4e..401fffb35 100644 --- a/include/flexflow/optimizer.h +++ b/include/flexflow/optimizer.h @@ -18,6 +18,7 @@ #include "flexflow/parallel_tensor.h" #include "legion.h" +#include "accessor.h" namespace FlexFlow { @@ -30,6 +31,7 @@ class Optimizer { virtual void init(void) = 0; virtual void next(void) = 0; virtual void update(const ParallelTensor p) = 0; + virtual void unified_update(std::vector const parameters) = 0; FFModel const *model; }; @@ -43,6 +45,7 @@ class SGDOptimizer : public Optimizer { void init(void); void next(void); void update(const ParallelTensor p); + void unified_update(std::vector const parameters); void set_weight_decay(double _weight_decay); static void ps_update_task(Legion::Task const *task, std::vector const ®ions, @@ -60,6 +63,11 @@ class SGDOptimizer : public Optimizer { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void + nccl_unified_update_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void nccl_update_task_gpu(SGDOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, @@ -85,6 +93,7 @@ class AdamOptimizer : public Optimizer { void init(void); void next(void); void update(const ParallelTensor p); + void unified_update(std::vector const parameters); void set_weight_decay(double _weight_decay); static void ps_update_task(Legion::Task const *task, std::vector const ®ions, @@ -103,6 +112,11 @@ class AdamOptimizer : public Optimizer { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void + nccl_unified_update_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void nccl_update_task_gpu(AdamOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, @@ -110,10 +124,19 @@ class AdamOptimizer : public Optimizer { float *w_ptr, float *v_ptr, float *m_ptr); + static void nccl_unified_update_task_gpu(AdamOptimizer const *op, + OpMeta const *meta, + GenericTensorAccessorR *accWGrads, + size_t *size, + GenericTensorAccessorW *accWs, + GenericTensorAccessorW *accVs, + GenericTensorAccessorW *accMs); #endif double alpha, beta1, beta2, weight_decay, epsilon; double alpha_t, beta1_t, beta2_t; std::map v_values, m_values; + size_t reservedWorkSpaceSize = 0; + int parameters_num = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/parallel_ops/allreduce.h b/include/flexflow/parallel_ops/allreduce.h new file mode 100644 index 000000000..a28d4cef9 --- /dev/null +++ b/include/flexflow/parallel_ops/allreduce.h @@ -0,0 +1,57 @@ +#ifndef _FLEXFLOW_ALLREDUCE_H +#define _FLEXFLOW_ALLREDUCE_H + +#include "flexflow/layer.h" +#include "flexflow/node.h" +#include "flexflow/op_meta.h" +#include "flexflow/operator.h" +#include "flexflow/parallel_ops/allreduce_params.h" 
+#include "parallel_op.h" + +namespace FlexFlow { + +class AllReduce : public ParallelOp { +public: + using Params = AllReduceParams; + using Input = ParallelTensor; + + AllReduce(FFModel &model, + const ParallelTensor input, + int allreduce_legion_dim, + char const *name = NULL); + AllReduce(FFModel &model, + Params const ¶ms, + Input const input, + char const *name = nullptr); + void create_input_partition(FFModel &model) override; + void init(FFModel const &) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + bool get_int_parameter(PMParameter, int *) const override; + bool append_parallel_op_info( + std::vector ¶llel_ops) const override; + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void forward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + + Params get_params() const; + +public: + int allreduce_dim; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_ALLREDUCE_H diff --git a/include/flexflow/parallel_ops/allreduce_params.h b/include/flexflow/parallel_ops/allreduce_params.h new file mode 100644 index 000000000..a0daac8f9 --- /dev/null +++ b/include/flexflow/parallel_ops/allreduce_params.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_ALLREDUCE_PARAMS_H +#define _FLEXFLOW_ALLREDUCE_PARAMS_H + +namespace FlexFlow { + +struct AllReduceParams { + int allreduce_legion_dim; + char name[MAX_OPNAME]; + bool is_valid(ParallelTensorShape const &) const; +}; +bool operator==(AllReduceParams const &, AllReduceParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::AllReduceParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_ALLREDUCE_PARAMS_H diff --git a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h new file mode 100644 index 000000000..02a5026fc --- /dev/null +++ b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h @@ -0,0 +1,31 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_ALLREDUCE_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_ALLREDUCE_KERNELS_H + +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/parallel_ops/allreduce.h" + +namespace FlexFlow { + +class AllReduceMeta : public OpMeta { +public: + AllReduceMeta(FFHandler handle, AllReduce const *reduct); +}; + +namespace Kernels { +namespace AllReduce { + +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + +} // namespace AllReduce +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_ALLREDUCE_KERNELS_H diff --git a/include/flexflow/parallel_ops/replicate.h b/include/flexflow/parallel_ops/replicate.h index 381f690cd..ac41a6437 100644 --- a/include/flexflow/parallel_ops/replicate.h +++ b/include/flexflow/parallel_ops/replicate.h @@ -31,6 +31,10 @@ class Replicate : public ParallelOp { bool get_int_parameter(PMParameter, int *) const override; bool 
append_parallel_op_info( std::vector ¶llel_ops) const override; + static void init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -39,6 +43,21 @@ class Replicate : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + + template + static void + forward_task_with_type(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + + template + static void backward_task_with_type( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/parallel_tensor.h b/include/flexflow/parallel_tensor.h index db77b4903..d98ffdc66 100644 --- a/include/flexflow/parallel_tensor.h +++ b/include/flexflow/parallel_tensor.h @@ -101,6 +101,7 @@ struct ParallelTensorShape { RecordFormatter as_dot() const; size_t get_piece_size() const; + size_t get_piece_num_elements() const; bool is_valid() const; int get_num_replica_dims() const; diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index 46e323b18..d07799588 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -82,6 +82,12 @@ __global__ void assign_kernel(DT *ptr, Legion::coord_t size, DT value); template __global__ void copy_kernel(DT *dst, const DT *src, Legion::coord_t size); +template +__global__ void copy_kernel_with_replicate(DT *dst, + const DT *src, + Legion::coord_t origin_size, + Legion::coord_t size); + template __global__ void add_kernel(T *data_ptr, T const *grad_ptr, size_t size); @@ -132,9 +138,15 @@ __host__ void updateGAS(float *para_ptr, template void print_tensor(T const *ptr, size_t num_elements, char const *prefix); +template +void save_tensor(T const *ptr, size_t num_elements, char const *file_name); cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain); +cudnnStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(cudnnTensorDescriptor_t tensor, + Legion::Domain domain, + DataType data_type = DT_FLOAT); cudaDataType_t ff_to_cuda_datatype(DataType type); diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 697083223..8c589305c 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -19,7 +19,7 @@ do { \ std::stringstream _error; \ if (status != miopenStatusSuccess) { \ - _error << "CUDNN failure: " << status; \ + _error << "CUDNN failure: " << miopenGetErrorString(status); \ FatalError(_error.str()); \ } \ } while (0) @@ -82,6 +82,12 @@ __global__ void assign_kernel(DT *ptr, Legion::coord_t size, DT value); template __global__ void copy_kernel(DT *dst, const DT *src, Legion::coord_t size); +template +__global__ void copy_kernel_with_replicate(DT *dst, + const DT *src, + Legion::coord_t origin_size, + Legion::coord_t size); + template __global__ void add_kernel(T *data_ptr, T const *grad_ptr, size_t size); @@ -137,6 +143,10 @@ miopenStatus_t cudnnSetTensorDescriptorFromDomain(miopenTensorDescriptor_t tensor, Legion::Domain domain); +miopenStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, + Legion::Domain domain); + hipblasDatatype_t ff_to_cuda_datatype(DataType type); 
miopenDataType_t ff_to_cudnn_datatype(DataType type); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 4eab2155c..750838d82 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1613,7 +1613,7 @@ def flat(self, input, name=None): self.add_layer(OpType.FLAT, name) return Tensor(handle, owner_op_type=OpType.FLAT) - def softmax(self, input, axis=-1, name=None): + def softmax(self, input, axis=-1, last_layer=False, name=None): """Softmax activation function. :param input: the input Tensor. @@ -1625,7 +1625,7 @@ def softmax(self, input, axis=-1, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_softmax(self.handle, input.handle, axis, c_name) + handle = ffc.flexflow_model_add_softmax(self.handle, input.handle, axis, last_layer, c_name) self.add_layer(OpType.SOFTMAX, name) return Tensor(handle, owner_op_type=OpType.SOFTMAX) @@ -2018,6 +2018,13 @@ def update(self): :returns: None -- no returns. """ ffc.flexflow_model_update(self.handle) + + def unified_update(self): + """Update weights and biases of all layers. + + :returns: None -- no returns. + """ + ffc.flexflow_model_unified_update(self.handle) def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): """Configure the model for trainting. FlexFlow uses lazy initialization, @@ -2059,6 +2066,25 @@ def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): ff_tensor.set_tensor(self, np_tensor) print("Compiled ffmodel!") + def load_bert_pretrained(self, checkpoint=None): + # store weights in dict + weights_dict = {} + for name, params in checkpoint.named_parameters(): + weights_dict[name.replace("LayerNorm", "layer_norm").replace(".", "_")] = params.detach().cpu().numpy() + print(name.replace("LayerNorm", "layer_norm").replace(".", "_")) + # some weights not in params + weights_dict['cls_predictions_decoder_weight'] = checkpoint.cls.predictions.decoder.weight.detach().cpu().numpy() + weights_dict['cls_predictions_decoder_bias'] = checkpoint.cls.predictions.decoder.bias.detach().cpu().numpy() + for i in range (self._nb_layers): + layer = self._layers[i] + if (layer.name + "_weight") in weights_dict: + print('weight: ' + layer.name) + weight = layer.get_parameter_by_id(0) + weight.set_tensor(self, weights_dict[layer.name + "_weight"]) + if (layer.name + "_bias") in weights_dict: + print('bias: ' + layer.name) + bias = layer.get_parameter_by_id(1) + bias.set_tensor(self, weights_dict[layer.name + "_bias"]) def fit(self, x=None, y=None, batch_size=None, epochs=1): """Trains the model for a fixed number of epochs (iterations on a dataset). @@ -2098,7 +2124,7 @@ def fit(self, x=None, y=None, batch_size=None, epochs=1): for d in dataloaders: d.next_batch(self) self.forward() - self.zero_gradients() + # self.zero_gradients() self.backward() self.update() self._ffconfig.end_trace(self._tracing_id) diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index 65b1669e9..8ebac2146 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -1,4 +1,4 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# Copyright 2020 Stanford University, Los Alamos National Laboratory # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,7 +16,6 @@ from collections import OrderedDict from enum import Enum from typing import List -import copy import numpy as np from flexflow.core.flexflow_cffi import Tensor, NormInitializer @@ -26,6 +25,7 @@ try: import torch + print(torch.__version__) from torch.fx.immutable_collections import immutable_dict except: pass @@ -653,14 +653,26 @@ def string_to_ff(string, ffmodel, node_to_output): data = Node.StringData(string) name = data.name input_tensor = node_to_output[data.innodes[0]] - return ffmodel.identity(input=input_tensor, name=name) - # TODO: Change to ffmodel.layernorm() once supported + axes = [len(input_tensor.dims) - 1] + return ffmodel.layer_norm( + input=input_tensor, + axes=axes, + elementwise_affine=True, + eps=1e-6, + name=name, + ) def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] - return ffmodel.identity(input=input_tensor, name=self.name) - # TODO: Change to ffmodel.layernorm() once supported - + axes = [0] + eps = self.module.eps + return ffmodel.layer_norm( + input=input_tensor, + axes=axes, + elementwise_affine=True, + eps=eps, + name=self.name, + ) class T5LayerNormNode(Node): """ @@ -931,7 +943,7 @@ def construct_node(node): elif name.find("contiguous") >= 0: return ContiguousNode(node) elif name.find("tanh") >= 0: return TanhFNode(node) elif name.find("gelu") >= 0: return GeluFNode(node) - assert 0, f"Unknown function or method: {name}" + assert 0, f"Unknown function or method: {name} {node}" @staticmethod def is_right_scalar_op(node): @@ -1186,16 +1198,24 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) - return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, name=self.name, - ) - + if self.scalar_pos == FunctionNode.ScalarPosition.RIGHT: + return ffmodel.scalar_sub( + input=input_tensor, scalar=scalar, inplace=False, name=self.name, + ) + else: + negative_input = ffmodel.scalar_multiply( + input=input_tensor, scalar=-1, inplace=False, name=self.name + '_negative', + ) + return ffmodel.scalar_sub( + input=negative_input, scalar=-scalar, inplace=False, name=self.name, + ) + class ScalarTrueDivNode(FunctionNode): def __init__(self, node): @@ -1220,15 +1240,16 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] scalar = self.innodes[1] assert type(scalar) is float + return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, name=self.name, + input=input_tensor, scalar=scalar, inplace=False, name=self.name, ) @@ -1409,6 +1430,10 @@ def to_ff(self, ffmodel, node_to_output): @staticmethod def slice_tensor(ffmodel, tensor, slices, name): + + print('slices', slices) + old_shape = tensor.dims + print('old_shape', tensor.dims) """Returns a reshaped tensor based on the given slices.""" def is_colon(slice_elem): """Returns if the slice is equivalent to `:`.""" @@ -1424,11 +1449,20 @@ def is_truncate(slice_elem, old_size): stop = old_size if 
slice_elem.stop == None else slice_elem.stop new_size = stop - start return new_size < old_size - + def is_single_element(slice_elem): return isinstance(slice_elem, int) + def is_exact(slice_elem, old_size): + if slice_elem is None: + return False + start = 0 if slice_elem.start == None else slice_elem.start + stop = old_size if slice_elem.stop == None else slice_elem.stop + new_size = stop - start + return new_size == old_size + shape = tensor.dims + print('input dims', tensor.dims) # Fewer slices than input dimensions diff = len(shape) - len(slices) @@ -1441,12 +1475,18 @@ def is_single_element(slice_elem): # Match dimensions from right to left new_shape = [] # append then reverse j = len(shape) - 1 + import copy curr_tensor = copy.copy(tensor) for slice_elem in reversed(slices): - if is_colon(slice_elem): + print('slice_elem', slice_elem) + if is_colon(slice_elem) or is_exact(slice_elem, shape[j]): + print('shape', shape) assert j >= 0 + print('j', j) + print('new_shape_bef', new_shape) new_shape.append(shape[j]) + print('new_shape_aft', new_shape) j -= 1 elif is_unsqueeze(slice_elem): new_shape.append(1) @@ -1456,6 +1496,8 @@ def is_single_element(slice_elem): curr_tensor = ffmodel.split(input=curr_tensor, sizes=splits, axis=j, name=name)[0] new_shape.append(1) j -= 1 + elif is_exact(slice_elem, shape[j]): + print('exact') elif is_truncate(slice_elem, shape[j]): assert j >= 0 start = 0 if slice_elem.start == None else slice_elem.start @@ -1481,8 +1523,45 @@ def is_single_element(slice_elem): assert 0, f"Unsupported slice element: {slice_elem}" new_shape.reverse() - return ffmodel.reshape(input=curr_tensor, shape=new_shape, name=name,) - + if len(new_shape) == 0: + return curr_tensor + else: + print('new_shape', new_shape) + if old_shape == new_shape: + return curr_tensor + return ffmodel.reshape(input=curr_tensor, shape=new_shape, name=name,) + + + +# """Returns a reshaped tensor based on the given slices.""" +# def is_colon(slice_elem): +# """Returns if the slice is equivalent to `:`.""" +# return slice_elem == slice(None, None, None) +# +# def is_unsqueeze(slice_elem): +# """Returns if the slice is equivalent to unsqueezing that +# dimension.""" +# return slice_elem is None +# shape = tensor.dims +# # Match dimensions from right to left +# new_shape = [] # append then reverse +# j = len(shape) - 1 +# for slice_elem in reversed(slices): +# if is_colon(slice_elem): +# assert j >= 0 +# new_shape.append(shape[j]) +# j -= 1 +# elif is_unsqueeze(slice_elem): +# new_shape.append(1) +# else: +# assert 0, f"Unsupported slice element: {slice_elem}" +# new_shape.reverse() +# return ffmodel.reshape( +# input=tensor, shape=new_shape, name=name, +# ) + + + @staticmethod def strings_to_slices(strings: List[str]): # Extract slice elements @@ -1583,14 +1662,14 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, name=self.name, + input=input_tensor, scalar=scalar, inplace=False, name=self.name, ) @@ -1751,7 +1830,7 @@ def __init__(self, node): def parse(self): s = [self.name] scalar = self.innodes[1] - if type(scalar) is not int or type(scalar) is not float: + if not isinstance(scalar, [int, 
float]): assert 0, "FlexFlow does not support tensor floor division" innodes = (self.innodes[0],) s.append(self.parse_inoutnodes(innodes)) @@ -2290,11 +2369,16 @@ def string_to_ff(string, ffmodel, node_to_output): "since attributes require access to the PyTorch model" ) - def to_ff(self, ffmodel, node_to_output): - return self.attr_to_ff_tensor(ffmodel) + def to_ff(self, ffmodel, node_to_output, input_tensors): + return self.attr_to_ff_tensor(ffmodel, input_tensors) + + def attr_to_ff_tensor(self, ffmodel, input_tensors): - def attr_to_ff_tensor(self, ffmodel): - torch_tensor = self.attr + + torch_tensor = self.attr + assert (torch_tensor.shape[0] == 1) + batch_size = ffmodel._ffconfig.batch_size + torch_tensor = np.repeat(torch_tensor, batch_size, axis=0) ff_dtype = Node.torch_to_ff_dtype(torch_tensor.dtype) requires_grad = torch_tensor.requires_grad @@ -2309,14 +2393,17 @@ def attr_to_ff_tensor(self, ffmodel): ff_dtype = DataType.DT_FLOAT np_tensor = np_tensor.astype(np.float32) + print('attr: ', torch_tensor.shape) + assert (torch_tensor.shape[0] == batch_size) ff_tensor = ffmodel.create_tensor( - torch_tensor.shape, ff_dtype, requires_grad, + torch_tensor.shape, ff_dtype, True, ) # delay set_tensor, add to ffmodel ffmodel.attr_tensors[ff_tensor] = np_tensor # ff_tensor.set_tensor( # ffmodel, np_tensor # ) + input_tensors.append(ff_tensor) return ff_tensor @@ -2398,7 +2485,7 @@ def to_ff(self, ffmodel, node_to_output, output_tensors): # `CrossEntropyLoss()` implementation logits = node_to_output[other["logits"].name] softmax_logits = ffmodel.softmax( - input=logits, name=self.name, + input=logits, last_layer=True, name=self.name, ) output_tensors[:] += [softmax_logits] else: @@ -2440,6 +2527,11 @@ def _trace_model(self): batch_size=self.batch_size, sequence_length=self.seq_length, ) + + #import pickle + #with open('symbolic_trace', 'rb') as f: + #traced = pickle.load(f) + else: traced = torch.fx.symbolic_trace(self.model) @@ -2527,6 +2619,8 @@ def torch_to_ff(self, ffmodel, input_tensors, verbose=False): elif isinstance(node, OutputNode): node.to_ff(ffmodel, node_to_output, output_tensors) node_output = None + elif isinstance(node, AttributeNode): + node_output = node.to_ff(ffmodel, node_to_output, input_tensors) else: node_output = node.to_ff(ffmodel, node_to_output) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 22ad739dd..1d8634f22 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -178,6 +178,11 @@ void flexflow_model_update(flexflow_model_t handle_) { handle->update(); } +void flexflow_model_unified_update(flexflow_model_t handle_) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + handle->unified_update(); +} + void flexflow_model_compile(flexflow_model_t handle_, enum LossType loss_type, int *metrics, @@ -568,8 +573,8 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, for (int i = 0; i < n; i++) { axes_vec.push_back(axes[i]); } - Tensor tensor = - handle->layer_norm(input, axes_vec, elementwise_affine, eps, name); + Tensor tensor = handle->layer_norm( + input, axes_vec, elementwise_affine, eps, input->data_type, name); DEBUG_PRINT("[LayerNorm] new Tensor %p, input %p, elementwise_affine %d, eps " "%f, name %s", tensor, @@ -734,10 +739,11 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle_, const flexflow_tensor_t input_, int dim, + bool last_layer, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input 
= FFCObjectWrapper::unwrap(input_); - Tensor tensor = handle->softmax(input, dim, name); + Tensor tensor = handle->softmax(input, dim, last_layer, name); DEBUG_PRINT( "[Softmax] new Tensor %p, input %p, name %s", tensor, input, name); return FFCObjectWrapper::wrap(tensor); diff --git a/src/dataloader/dataloader.cc b/src/dataloader/dataloader.cc index 441a08819..614482e8b 100644 --- a/src/dataloader/dataloader.cc +++ b/src/dataloader/dataloader.cc @@ -97,7 +97,7 @@ SingleDataLoader::SingleDataLoader(FFModel &ff, datatype = datatype_; // Currently assume that the leading dim of input is a replica dim of degree 1 assert(input->dims[input->num_dims - 1].is_replica_dim); - assert(input->dims[input->num_dims - 1].size == 1); + // assert(input->dims[input->num_dims - 1].size == 1); batch_input = input; ParallelDim dims[MAX_TENSOR_DIM]; diff --git a/src/dataloader/dataloader.cpp b/src/dataloader/dataloader.cpp index 7d9ffc02b..97668d705 100644 --- a/src/dataloader/dataloader.cpp +++ b/src/dataloader/dataloader.cpp @@ -41,10 +41,12 @@ void SingleDataLoader::load_input(Task const *task, int num_dims = full_input_domain.get_dim(); assert(num_dims + 1 == batch_input_domain.get_dim()); // assert the leading replica dim has a degree of one - assert(batch_input_domain.hi()[num_dims] == - batch_input_domain.lo()[num_dims]); + // assert(batch_input_domain.hi()[num_dims] == + // batch_input_domain.lo()[num_dims]); coord_t batch_size = batch_input_domain.hi()[num_dims - 1] - batch_input_domain.lo()[num_dims - 1] + 1; + coord_t replicate_num = + batch_input_domain.hi()[num_dims] - batch_input_domain.lo()[num_dims] + 1; coord_t num_elements_per_batch = batch_input_domain.get_volume() / batch_size; // FIXME: currently assume continous indices assert(batch_size == meta->num_samples); @@ -61,13 +63,15 @@ void SingleDataLoader::load_input(Task const *task, // printf("ptr(%p, %p), idx0 %d nb_elements_per_batch %d, batch_size %d, // %d\n", acc_full_input.ptr, input_zc, start_idx, num_elements_per_batch, // batch_size, start_idx * num_elements_per_batch); - hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel
<DT>), + assert(batch_input_domain.get_volume() % replicate_num == 0); + hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel_with_replicate<DT>
), GET_BLOCKS(batch_input_domain.get_volume()), CUDA_NUM_THREADS, 0, stream, batch_input_ptr, input_zc, + batch_input_domain.get_volume() / replicate_num, batch_input_domain.get_volume()); checkCUDA(hipDeviceSynchronize()); } diff --git a/src/dataloader/dataloader.cu b/src/dataloader/dataloader.cu index c2994d00a..5462532d7 100644 --- a/src/dataloader/dataloader.cu +++ b/src/dataloader/dataloader.cu @@ -40,10 +40,13 @@ void SingleDataLoader::load_input(Task const *task, int num_dims = full_input_domain.get_dim(); assert(num_dims + 1 == batch_input_domain.get_dim()); // assert the leading replica dim has a degree of one - assert(batch_input_domain.hi()[num_dims] == - batch_input_domain.lo()[num_dims]); + // assert(batch_input_domain.hi()[num_dims] == + // batch_input_domain.lo()[num_dims]); coord_t batch_size = batch_input_domain.hi()[num_dims - 1] - batch_input_domain.lo()[num_dims - 1] + 1; + + coord_t replicate_num = + batch_input_domain.hi()[num_dims] - batch_input_domain.lo()[num_dims] + 1; coord_t num_elements_per_batch = batch_input_domain.get_volume() / batch_size; // FIXME: currently assume continous indices assert(batch_size == meta->num_samples); @@ -60,11 +63,15 @@ void SingleDataLoader::load_input(Task const *task, // printf("ptr(%p, %p), idx0 %d nb_elements_per_batch %d, batch_size %d, // %d\n", acc_full_input.ptr, input_zc, start_idx, num_elements_per_batch, // batch_size, start_idx * num_elements_per_batch); - copy_kernel
+ assert(batch_input_domain.get_volume() % replicate_num == 0); + copy_kernel_with_replicate
<<>>(batch_input_ptr, input_zc, batch_input_domain.get_volume()); + stream>>>(batch_input_ptr, + input_zc, + batch_input_domain.get_volume() / replicate_num, + batch_input_domain.get_volume()); checkCUDA(cudaDeviceSynchronize()); } diff --git a/src/loss_functions/loss_functions.cc b/src/loss_functions/loss_functions.cc index ae89c3d46..d887ee924 100644 --- a/src/loss_functions/loss_functions.cc +++ b/src/loss_functions/loss_functions.cc @@ -49,6 +49,8 @@ void Loss::backward(FFModel *model, if (loss_type == LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE) { assert(logit->get_volume() == label->get_volume()); scale_factor = 2.0f / logit->get_volume(); + } else if (loss_type == LOSS_SPARSE_CATEGORICAL_CROSSENTROPY) { + scale_factor = 1.0f; } else { scale_factor = 1.0f / model->config.batchSize; } @@ -131,9 +133,12 @@ void Loss::backward_task_with_dim(Task const *task, regions[2], task->regions[2], FID_DATA, ctx, runtime); // assertion the outter-most dim is replica dim and replica degree is 1 assert(acc_logit.rect.hi[NDIM - 1] == acc_logit.rect.lo[NDIM - 1]); - int num_samples = - acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; - int num_classes = acc_logit.rect.volume() / num_samples; + + int num_classes = acc_logit.rect.hi[0] - acc_logit.rect.lo[0] + 1; + int num_samples = acc_logit.rect.volume() / num_classes; + // int num_samples = + // acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; + // int num_classes = acc_logit.rect.volume() / num_samples; assert(acc_logit_grad.rect == acc_logit.rect); int k = 1; if (loss->repl_labels) { diff --git a/src/loss_functions/loss_functions.cpp b/src/loss_functions/loss_functions.cpp index a87aaade8..3453f3fbf 100644 --- a/src/loss_functions/loss_functions.cpp +++ b/src/loss_functions/loss_functions.cpp @@ -20,6 +20,7 @@ namespace FlexFlow { using namespace Legion; +int const MASK_TOKEN = -100; __global__ void sparse_categorical_crossentropy_loss_backward(float *logit_grad, @@ -33,6 +34,25 @@ __global__ void } } +__global__ void + sparse_categorical_crossentropy_loss_backward_with_mask(float *logit_grad, + int const *label, + coord_t num_samples, + coord_t num_classes, + int const k, + float *num) { + CUDA_KERNEL_LOOP(i, num_samples * num_classes) { + int sample_id = i / num_classes; + int label_idx = label[i / (k * num_classes)]; + if (label_idx != MASK_TOKEN && (i == sample_id * num_classes + label_idx)) { + logit_grad[i] -= 1.0f; + atomicAdd(&num[0], 1.0f); + } else if (label_idx == MASK_TOKEN) { + logit_grad[i] = 0.0f; + } + } +} + __global__ void categorical_crossentropy_loss_backward(float *logit_grad, float const *logit, float const *label, @@ -75,8 +95,14 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_ptr, logit_volume * sizeof(float), hipMemcpyDeviceToDevice)); - hipLaunchKernelGGL(sparse_categorical_crossentropy_loss_backward, - GET_BLOCKS(num_samples), + + assert(scale_factor == 1.0f); + float *num; + checkCUDA(hipMalloc(&num, sizeof(float))); + float effective_tokens; + int parallelism = num_samples * num_classes; + hipLaunchKernelGGL(sparse_categorical_crossentropy_loss_backward_with_mask, + GET_BLOCKS(parallelism), CUDA_NUM_THREADS, 0, stream, @@ -84,7 +110,10 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( label_ptr, num_samples, num_classes, - k); + k, + num); + + hipMemcpy(&effective_tokens, num, sizeof(float), hipMemcpyDeviceToHost); // Scale logit gradients by op->scale_factor hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(logit_grad_volume), @@ -94,7 +123,7 @@ 
void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_grad_ptr, logit_grad_volume, 0, - scale_factor * k); + 1.0f / effective_tokens); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( diff --git a/src/loss_functions/loss_functions.cu b/src/loss_functions/loss_functions.cu index f78311980..edd8f03fa 100644 --- a/src/loss_functions/loss_functions.cu +++ b/src/loss_functions/loss_functions.cu @@ -18,6 +18,7 @@ namespace FlexFlow { +int const MASK_TOKEN = -100; using namespace Legion; __global__ void @@ -32,6 +33,25 @@ __global__ void } } +__global__ void + sparse_categorical_crossentropy_loss_backward_with_mask(float *logit_grad, + int const *label, + coord_t num_samples, + coord_t num_classes, + int const k, + float *num) { + CUDA_KERNEL_LOOP(i, num_samples * num_classes) { + int sample_id = i / num_classes; + int label_idx = label[i / (k * num_classes)]; + if (label_idx != MASK_TOKEN && (i == sample_id * num_classes + label_idx)) { + logit_grad[i] -= 1.0f; + atomicAdd(&num[0], 1.0f); + } else if (label_idx == MASK_TOKEN) { + logit_grad[i] = 0.0f; + } + } +} + __global__ void categorical_crossentropy_loss_backward(float *logit_grad, float const *logit, float const *label, @@ -74,14 +94,25 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_ptr, logit_volume * sizeof(float), cudaMemcpyDeviceToDevice)); - sparse_categorical_crossentropy_loss_backward<<>>( - logit_grad_ptr, label_ptr, num_samples, num_classes, k); - // Scale logit gradients by op->scale_factor + // calculate the scale factor inside kernel; + assert(scale_factor == 1.0f); + float *num; + checkCUDA(cudaMalloc(&num, sizeof(float))); + float effective_tokens; + int parallelism = num_samples * num_classes; + // sparse_categorical_crossentropy_loss_backward<<>>( + // logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); + sparse_categorical_crossentropy_loss_backward_with_mask<<< + GET_BLOCKS(parallelism), + CUDA_NUM_THREADS, + 0, + stream>>>(logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); + cudaMemcpy(&effective_tokens, num, sizeof(float), cudaMemcpyDeviceToHost); scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor * k); + logit_grad_ptr, logit_grad_volume, 0, 1.0f / effective_tokens); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( diff --git a/src/metrics_functions/metrics_functions.cc b/src/metrics_functions/metrics_functions.cc index e8ccbfe2e..8c7e23ad8 100644 --- a/src/metrics_functions/metrics_functions.cc +++ b/src/metrics_functions/metrics_functions.cc @@ -15,6 +15,7 @@ #include "flexflow/metrics_functions.h" #include "flexflow/model.h" +#include namespace FlexFlow { @@ -90,6 +91,8 @@ void Metrics::compute(FFModel *model, false /*must*/, 0 /*mapper_id*/, logit->machine_view.hash()); + // std::cout << "logit shape: " << logit->get_shape() << std::endl; + // std::cout << "label shape: " << label->get_shape() << std::endl; launcher.add_region_requirement(RegionRequirement( logit->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, logit->region)); launcher.add_field(0, FID_DATA); @@ -154,6 +157,7 @@ PerfMetrics assert(acc_label.rect.lo[0] == acc_label.rect.hi[0]); // Cannot measure categorical_crossentropy w/ sparse labels // Use measure_sparse_categorical_crossentropy instead + // std::cout << "num_classes: " << num_classes << std::endl; assert(!me->measure_categorical_crossentropy); Metrics::update_metrics_sparse_label_kernel_wrapper(acc_logit.ptr, acc_label.ptr, diff --git 
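
The masked backward kernels implement the usual softmax-minus-one-hot gradient, restricted to tokens whose label is not MASK_TOKEN, and they normalize by the number of surviving tokens instead of by the batch size (hence the assert(scale_factor == 1.0f)). A host-side reference of the same computation, assuming k == 1, no label replication, and that logit_grad already holds softmax probabilities as the device-to-device copy in the wrapper guarantees; note that the running counter starts at zero, which the device-side counter obtained from cudaMalloc also needs (for example via an explicit memset) before the atomic adds:

#include <cstddef>

int const MASK_TOKEN_REF = -100;

// CPU reference for the masked sparse-CE backward pass (illustrative only).
void masked_sparse_ce_backward_ref(float *logit_grad, // [num_samples * num_classes], softmax probs on entry
                                   int const *labels, // [num_samples], -100 marks ignored positions
                                   int num_samples,
                                   int num_classes) {
  float effective_tokens = 0.0f; // assumes at least one unmasked token
  for (int s = 0; s < num_samples; s++) {
    float *row = logit_grad + (std::size_t)s * num_classes;
    if (labels[s] == MASK_TOKEN_REF) {
      for (int c = 0; c < num_classes; c++) {
        row[c] = 0.0f; // masked token contributes no gradient
      }
    } else {
      row[labels[s]] -= 1.0f; // d(CE)/d(logit) = softmax - one_hot
      effective_tokens += 1.0f;
    }
  }
  // mean over the unmasked tokens, matching the final scale_kernel(..., 1.0f / effective_tokens)
  for (std::size_t i = 0; i < (std::size_t)num_samples * num_classes; i++) {
    logit_grad[i] /= effective_tokens;
  }
}
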
a/src/metrics_functions/metrics_functions.cpp b/src/metrics_functions/metrics_functions.cpp index 90d727b9b..1c57bd6ba 100644 --- a/src/metrics_functions/metrics_functions.cpp +++ b/src/metrics_functions/metrics_functions.cpp @@ -20,6 +20,7 @@ namespace FlexFlow { float const LOG_MIN_VALUE = 0.00000001f; +int const MASK_TOKEN = -100; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, @@ -30,7 +31,7 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, CUDA_KERNEL_LOOP(b, num_samples) { if (metrics.measure_accuracy) { float max_val = -1.0f; - int my_label = -1; + int my_label = 0; for (int i = 0; i < num_classes; i++) { float my_logit = logits[b * num_classes + i]; if (my_logit > max_val) { @@ -39,14 +40,19 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, } } assert(my_label >= 0); - atomicAdd(&(perf->train_all), 1); - if (labels[b] == my_label) { - atomicAdd(&(perf->train_correct), 1); + if (labels[b] != MASK_TOKEN) { + atomicAdd(&(perf->train_all), 1); + if (labels[b] == my_label) { + atomicAdd(&(perf->train_correct), 1); + } } } if (metrics.measure_sparse_categorical_crossentropy) { - float my_logit = max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); - atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + if (labels[b] != MASK_TOKEN) { + float my_logit = + max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + } } if (metrics.measure_mean_squared_error || metrics.measure_root_mean_squared_error || diff --git a/src/metrics_functions/metrics_functions.cu b/src/metrics_functions/metrics_functions.cu index 2e037eb47..8c584c397 100644 --- a/src/metrics_functions/metrics_functions.cu +++ b/src/metrics_functions/metrics_functions.cu @@ -19,6 +19,7 @@ namespace FlexFlow { float const LOG_MIN_VALUE = 0.00000001f; +int const MASK_TOKEN = -100; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, @@ -29,7 +30,7 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, CUDA_KERNEL_LOOP(b, num_samples) { if (metrics.measure_accuracy) { float max_val = -1.0f; - int my_label = -1; + int my_label = 0; for (int i = 0; i < num_classes; i++) { float my_logit = logits[b * num_classes + i]; if (my_logit > max_val) { @@ -38,14 +39,19 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, } } assert(my_label >= 0); - atomicAdd(&(perf->train_all), 1); - if (labels[b] == my_label) { - atomicAdd(&(perf->train_correct), 1); + if (labels[b] != MASK_TOKEN) { + atomicAdd(&(perf->train_all), 1); + if (labels[b] == my_label) { + atomicAdd(&(perf->train_correct), 1); + } } } if (metrics.measure_sparse_categorical_crossentropy) { - float my_logit = max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); - atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + if (labels[b] != MASK_TOKEN) { + float my_logit = + max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + } } if (metrics.measure_mean_squared_error || metrics.measure_root_mean_squared_error || diff --git a/src/ops/dropout.cc b/src/ops/dropout.cc index 55f673082..2ebfaff53 100644 --- a/src/ops/dropout.cc +++ b/src/ops/dropout.cc @@ -28,7 +28,7 @@ using PCG::Node; using namespace FlexFlow::Kernels::Dropout; -Tensor FFModel::dropout(const Tensor input, +Tensor FFModel::dropout(Tensor const input, float rate, unsigned long long seed, char const *name) { @@ -86,7 +86,7 @@ bool 
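
The metric kernels apply the same masking convention: positions labeled MASK_TOKEN are excluded both from the accuracy counters and from the sparse cross-entropy sum. A compact host-side equivalent, assuming (as the kernel does) that the logits already contain probabilities:

#include <algorithm>
#include <cmath>

void update_sparse_metrics_ref(float const *logits, int const *labels,
                               int num_samples, int num_classes,
                               int &train_all, int &train_correct,
                               float &sparse_cce_loss) {
  for (int b = 0; b < num_samples; b++) {
    if (labels[b] == -100) {
      continue; // masked token: ignored by every metric
    }
    int argmax = 0;
    for (int c = 1; c < num_classes; c++) {
      if (logits[b * num_classes + c] > logits[b * num_classes + argmax]) {
        argmax = c;
      }
    }
    train_all += 1;
    if (argmax == labels[b]) {
      train_correct += 1;
    }
    float p = std::max(logits[b * num_classes + labels[b]], 1e-8f); // LOG_MIN_VALUE
    sparse_cce_loss += -std::log(p);
  }
}
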
operator==(DropoutParams const &lhs, DropoutParams const &rhs) { } Dropout::Dropout(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, float _rate, unsigned long long _seed, char const *name) @@ -111,12 +111,12 @@ Dropout::Dropout(FFModel &model, Dropout::Dropout(FFModel &model, Dropout const &other, - const ParallelTensor input) + ParallelTensor const input) : Dropout(model, input, other.rate, other.seed, other.name) {} Dropout::Dropout(FFModel &model, DropoutParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) : Dropout(model, input, params.rate, params.seed, name) {} @@ -210,12 +210,12 @@ void Dropout::forward_task(Task const *task, assert(task->regions.size() == 2); // const Dropout* dropout = (const Dropout*) task->args; DropoutMeta *m = *((DropoutMeta **)task->local_args); - float const *input_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *output_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - - forward_kernel_wrapper(m, input_ptr, output_ptr); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + forward_kernel_wrapper(m, input, output); } void Dropout::backward(FFModel const &ff) { @@ -264,7 +264,13 @@ void Dropout::backward_task(Task const *task, float const *output_grad_ptr = helperGetTensorPointerRO( regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel_wrapper(m, output_grad_ptr, input_grad_ptr); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + backward_kernel_wrapper(m, output_grad, input_grad); } void Dropout::serialize(Legion::Serializer &sez) const { @@ -304,30 +310,36 @@ bool Dropout::measure_operator_cost(Simulator *sim, sim->free_all(); float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); assert(input_ptr != NULL); + + GenericTensorAccessorR input_acc(m->input_type[0], sub_input.get_domain(), input_ptr); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(output_ptr != NULL); + + GenericTensorAccessorW output_acc(m->output_type[0], sub_input.get_domain(), output_ptr); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); assert(m->profiling == false); std::function forward, backward; - forward = [&] { forward_kernel_wrapper(m, input_ptr, output_ptr); }; + forward = [&] { forward_kernel_wrapper(m, input_acc, output_acc); }; if (sim->computationMode == COMP_MODE_TRAINING) { float *input_grad_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); assert(input_grad_ptr != NULL); + GenericTensorAccessorW input_grad_acc(m->output_type[0], sub_input.get_domain(), input_grad_ptr); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *output_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(output_grad_ptr != NULL); + GenericTensorAccessorR output_grad_acc(m->output_type[0], sub_input.get_domain(), 
output_grad_ptr); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); backward = [&] { - backward_kernel_wrapper(m, output_grad_ptr, input_grad_ptr); + backward_kernel_wrapper(m, output_grad_acc, input_grad_acc); }; } diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index b90e85588..84c3f8ba9 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -211,6 +211,9 @@ ElementBinary::ElementBinary(FFModel &model, numdim, dims, in1->data_type, this); broadcast_input1 = (inputs[0]->get_volume() != outputs[0]->get_volume()); broadcast_input2 = (inputs[1]->get_volume() != outputs[0]->get_volume()); + + batch_size = dims[numdim - 2].size; + } ElementBinary::ElementBinary( @@ -337,6 +340,8 @@ OpMeta *ElementBinary::init_task(Task const *task, m->has_same_operands = eb->has_same_operands; m->broadcast_input1 = eb->broadcast_input1; m->broadcast_input2 = eb->broadcast_input2; + m->batch_size = eb->batch_size; + std::strcpy(m->op_name, eb->name); Domain input1_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); @@ -368,6 +373,10 @@ OpMeta *ElementBinary::init_task(Task const *task, } else { output_domain = input1_domain; } + m->replicate_size = m->broadcast_input1 + ? (input1_domain.get_volume() / m->batch_size) + : (input2_domain.get_volume() / m->batch_size); + assert(task->regions.size() == regions.size()); assert(regions.size() == num_regions); init_kernel(m, input1_domain, input2_domain, output_domain); @@ -381,7 +390,7 @@ void ElementBinary::forward(FFModel const &ff) { set_argumentmap_for_forward(ff, argmap); IndexLauncher launcher(ELEMENTBINARY_FWD_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(ElementBinary)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -448,7 +457,7 @@ __host__ void std::vector const ®ions, Context ctx, Runtime *runtime) { - // const ElementBinary* ele = (const ElementBinary*) task->args; + ElementBinary const *ele = (ElementBinary const *)task->args; ElementBinaryMeta const *m = *((ElementBinaryMeta **)task->local_args); Domain in1_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); @@ -793,10 +802,32 @@ bool ElementBinary::measure_operator_cost(Simulator *sim, return true; } +void ElementBinary::serialize(Legion::Serializer &sez) const { + sez.serialize(this->op_type); + sez.serialize(this->inplace_a); +} + +using PCG::Node; +/*static*/ +Node ElementBinary::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 2); + OperatorType op_type; + bool inplace_a; + dez.deserialize(op_type); + dez.deserialize(inplace_a); + ElementBinaryParams params; + params.type = op_type; + params.inplace_a = inplace_a; + return ff.get_or_create_node({inputs[0], inputs[1]}, params); +} ElementBinaryParams ElementBinary::get_params() const { ElementBinaryParams params; params.type = this->op_type; + params.inplace_a = this->inplace_a; return params; } diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index f99d9f749..46643b655 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -131,7 +131,8 @@ Tensor FFModel::tanh(const Tensor x, char const *name) { } Tensor FFModel::identity(const Tensor x, char const *name) { - return this->unary(OP_IDENTITY, x, false /*inplace*/, name); + // return this->unary(OP_IDENTITY, x, false /*inplace*/, name); + return x; } Tensor FFModel::gelu(const Tensor x, char const *name) { diff --git 
a/src/ops/element_unary.cpp b/src/ops/element_unary.cpp index 43c84b0c4..38c604329 100644 --- a/src/ops/element_unary.cpp +++ b/src/ops/element_unary.cpp @@ -189,8 +189,9 @@ __global__ void elewise_unary_backward_kernel(coord_t volume, case OP_GELU: { input_grad[i] = (T)(output_grad[i] * - (0.5 * erfc(-input[i] * M_SQRT1_2) - - 0.5 * M_SQRT1_2 * input[i] * exp(-input[i] * input[i] * 0.5))); + (0.5 * erfc(-input[i] * M_SQRT1_2) + + 0.5 * M_SQRT1_2 * input[i] * + ((2 / sqrt(M_PI)) * exp(-input[i] * input[i] * 0.5f)))); break; } case OP_RSQRT: { diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index d6e5bcfdc..187e60282 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -202,8 +202,9 @@ __global__ void elewise_unary_backward_kernel(coord_t volume, case OP_GELU: { input_grad[i] = (T)(output_grad[i] * - (0.5 * erfc(-input[i] * M_SQRT1_2) - - 0.5 * M_SQRT1_2 * input[i] * exp(-input[i] * input[i] * 0.5))); + (0.5 * erfc(-input[i] * M_SQRT1_2) + + 0.5 * M_SQRT1_2 * input[i] * + ((2 / sqrt(M_PI)) * exp(-input[i] * input[i] * 0.5f)))); break; } case OP_RSQRT: { diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 3b53213b9..8df632446 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -155,11 +155,8 @@ int Embedding::output_size(ParallelDim output_dims[MAX_TENSOR_DIM]) { output_dims[OUT_CHANNELS].size = this->out_channels; output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; - // Currently do not support parallelizing over the replica dim - output_dims[num_dims - 1].size = 1; - output_dims[num_dims - 1].degree = 1; - output_dims[num_dims - 1].parallel_idx = -1; - output_dims[num_dims - 1].is_replica_dim = true; + // Copy replica dim + output_dims[num_dims - 1] = input->dims[input->num_dims - 1]; return num_dims; } else { int num_dims = input->num_dims; @@ -170,11 +167,8 @@ int Embedding::output_size(ParallelDim output_dims[MAX_TENSOR_DIM]) { output_dims[OUT_CHANNELS].size = this->out_channels; output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; - // Currently do not support parallelizing over the replica dim - output_dims[num_dims - 1].size = 1; - output_dims[num_dims - 1].degree = 1; - output_dims[num_dims - 1].parallel_idx = -1; - output_dims[num_dims - 1].is_replica_dim = true; + // Copy replica dim + output_dims[num_dims - 1] = input->dims[input->num_dims - 1]; return num_dims; } // const int REPLICA = this->output_vocab_size_replica_dim(); @@ -189,13 +183,13 @@ int Embedding::weight_size(ParallelDim weight_dims[MAX_TENSOR_DIM]) { weight_dims[Weight::VOCAB_SIZE].size = this->num_entries; weight_dims[Weight::VOCAB_SIZE].degree = 1; weight_dims[Weight::VOCAB_SIZE].parallel_idx = -1; - for (int i = 2; i < input->num_dims; i++) { + for (int i = 2; i < input->num_dims + 1; i++) { weight_dims[i].size = input->dims[i - 1].degree; weight_dims[i].degree = weight_dims[i].size; weight_dims[i].parallel_idx = input->dims[i - 1].parallel_idx; weight_dims[i].is_replica_dim = true; } - return input->num_dims; + return input->num_dims + 1; } void Embedding::register_output_mappings() { diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 3dc442708..b241ff158 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -129,9 +129,11 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { // op->name, op_config)); // Cannot fuse parallel operators since they have different paralel_is // in forward and backward - assert(!op->is_parallel_op()); + assert(!op->is_parallel_op() || op->op_type == 
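
The revised GELU backward is the exact derivative of GELU(x) = x * Phi(x):

\frac{d}{dx}\,\mathrm{GELU}(x) = \Phi(x) + x\,\varphi(x), \qquad
\Phi(x) = \tfrac{1}{2}\,\mathrm{erfc}\!\left(-\tfrac{x}{\sqrt{2}}\right), \qquad
\varphi(x) = \tfrac{1}{\sqrt{2\pi}}\,e^{-x^{2}/2}.

The added coefficient works out because 0.5 * M_SQRT1_2 * (2 / sqrt(M_PI)) = 1 / sqrt(2 * pi), so the second term becomes x * phi(x); the sign between the two terms also flips from minus to plus, matching the analytic derivative.
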
OP_ALLREDUCE); // Currently don't consider nested fusion - assert(op->op_type != OP_FUSED); + if (op->op_type == OP_FUSED) { + return false; + } MachineView my_view = outputs[0]->machine_view; MachineView op_view = op->outputs[0]->machine_view; if (my_view == op_view) { diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index a602c5d6b..9da93f0c6 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -17,16 +17,22 @@ #include "flexflow/model.h" #include "flexflow/ops/batch_norm.h" #include "flexflow/ops/element_unary.h" +#include "flexflow/ops/embedding.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" +#include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/ops/kernels/dropout_kernels.h" #include "flexflow/ops/kernels/element_binary_kernels.h" +#include "flexflow/ops/kernels/embedding_kernels.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" #include "flexflow/utils/hip_helper.h" #include @@ -200,9 +206,7 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); DropoutMeta *m = (DropoutMeta *)metas->meta[op]; Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + m, my_input_accessor[0], my_output_accessor[0]); break; } case OP_LINEAR: { @@ -281,8 +285,8 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; Kernels::ElementBinary::forward_kernel_wrapper( m, @@ -290,12 +294,81 @@ __host__ void FusedOp::forward_task(Task const *task, my_input_accessor[1].get_float_ptr(), my_output_accessor[0].get_float_ptr()); break; + } + + case OP_EMBEDDING: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; + if (m->aggr == AGGR_MODE_NONE) { + // assert(kernel_domain.get_dim() == 2); + assert(my_input_accessor[0].domain.get_dim() + 1 == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i + 1]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i + 1]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } else { + assert(my_input_accessor[0].domain.get_dim() == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) { + 
assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } else { + assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM); + in_dim = my_input_accessor[0].domain.hi()[0] - + my_input_accessor[0].domain.lo()[0] + 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } + assert(my_input_accessor[0].data_type == DT_INT32 || + my_input_accessor[0].data_type == DT_INT64); + Kernels::Embedding::forward_kernel_wrapper(m, + my_input_accessor[0], + my_output_accessor[0], + my_weight_accessor[0], + in_dim, + out_dim, + effective_batch_size); break; } + case OP_GELU: case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -331,16 +404,55 @@ __host__ void FusedOp::forward_task(Task const *task, my_input_accessor[0].domain.get_volume()); break; } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + } else { + assert(false); + } + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); + break; + } case OP_RESHAPE: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + if (my_input_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int64_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int32_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if 
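
The in_dim / out_dim / effective_batch_size bookkeeping for the fused embedding mirrors the standalone Embedding op: out_dim is the extent of the innermost output dimension and effective_batch_size is whatever remains of the output volume once out_dim is divided out. A worked example with hypothetical shapes, a token-ID input of logical shape (batch = 8, seq = 512) and an embedding width of 768 under AGGR_MODE_NONE:

// output domain (innermost dim first): 768 x 512 x 8, volume = 3145728
int out_dim = 768;                            // hi()[0] - lo()[0] + 1
int in_dim = 1;                               // AGGR_MODE_NONE consumes one ID per output row
int effective_batch_size = 3145728 / out_dim; // = 4096 = 8 * 512
// consistency check used above: effective_batch_size * in_dim == input volume (8 * 512)
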
(my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type"); + } break; } case OP_TRANSPOSE: { @@ -358,6 +470,42 @@ __host__ void FusedOp::forward_task(Task const *task, my_output_accessor[0].domain); break; } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + LayerNorm::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_output_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } default: { fprintf(stderr, "Fusion currently does not support type = %d\n", @@ -675,8 +823,9 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == + // my_input_accessor[1].domain); assert(my_input_accessor[0].domain + // == my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; Kernels::ElementBinary::backward_kernel_wrapper( m, @@ -687,10 +836,50 @@ __host__ void FusedOp::backward_task(Task const *task, my_input_grad_accessor[1].get_float_ptr()); break; } + case OP_EMBEDDING: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; + assert(my_input_accessor[0].data_type == DT_INT64 || + my_input_accessor[0].data_type == DT_INT32); + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = my_output_grad_accessor[0].domain.hi()[0] - + my_output_grad_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_grad_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } else { + in_dim = my_input_accessor[0].domain.hi()[0] - + my_input_accessor[0].domain.lo()[0] + 1; + out_dim = my_output_grad_accessor[0].domain.hi()[0] - + my_output_grad_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_grad_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } + Kernels::Embedding::backward_kernel_wrapper(m, + my_input_accessor[0], 
+ my_output_grad_accessor[0], + my_weight_grad_accessor[0], + in_dim, + out_dim, + effective_batch_size); + break; + } + case OP_GELU: case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -709,7 +898,8 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == + // my_output_accessor[0].domain); Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; Kernels::Pool2D::backward_kernel_wrapper( m, @@ -737,10 +927,51 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_grad_accessor[0].domain.get_volume() == my_output_grad_accessor[0].domain.get_volume()); - Kernels::Reshape::backward_kernel_wrapper( - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain.get_volume()); + if (my_input_grad_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_int64_ptr(), + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].get_int32_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::backward_kernel_wrapper( + m, + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::backward_kernel_wrapper( + m, my_input_grad_accessor[0], my_output_grad_accessor[0]); break; } case OP_TRANSPOSE: { @@ -758,6 +989,46 @@ __host__ void FusedOp::backward_task(Task const *task, my_output_grad_accessor[0].domain); break; } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + LayerNorm::backward_kernel_wrapper( + m, + 
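
OP_ALLREDUCE is the one parallel operator the fusion pass now admits; its forward and backward wrappers each hand a buffer to the collective layer so it can be summed across the data-parallel replicas. The actual Kernels::AllReduce implementation is not part of this patch; with NCCL (or RCCL on hip_rocm) enabled, the core of such a wrapper plausibly reduces to a single in-place collective call like the sketch below, where the helper name and the way the communicator and stream are obtained are assumptions:

#include <cstddef>
#include <nccl.h>

// Hedged sketch: in-place sum of count floats across all data-parallel replicas.
void allreduce_sum_sketch(float *buffer, size_t count,
                          ncclComm_t comm, cudaStream_t stream) {
  // ncclAllReduce allows sendbuff == recvbuff for an in-place reduction.
  ncclResult_t st =
      ncclAllReduce(buffer, buffer, count, ncclFloat, ncclSum, comm, stream);
  (void)st; // a real wrapper would check the status
}

Whether the backward wrapper reduces the gradients the same way or merely forwards them is not visible here; the sketch only shows the collective call itself.
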
my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + gamma.get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + my_weight_grad_accessor[1].get_float_ptr()); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } default: assert(false && "Fusion currently does not support type"); } @@ -767,13 +1038,16 @@ __host__ void FusedOp::backward_task(Task const *task, assert(ooff == 0); // for (int i = 0; i < fused->numWeights; i++) // print_tensor(weight_grad_ptr[i], - // weight_grad_domain[i].get_volume(), "[Fused:backward:weight_grad]"); + // weight_grad_domain[i].get_volume(), + // "[Fused:backward:weight_grad]"); // for (int i = 0; i < fused->numInputs; i++) - // print_tensor(input_grad_ptr[i], input_grad_domain[i].get_volume(), + // print_tensor(input_grad_ptr[i], + // input_grad_domain[i].get_volume(), // "[Fused:backward:input_grad]"); // for (int i = 0; i < fused->numOutputs; i++) // print_tensor(output_grad_ptr[i], - // output_grad_domain[i].get_volume(), "[Fused:backward:output_grad]"); + // output_grad_domain[i].get_volume(), + // "[Fused:backward:output_grad]"); } }; // namespace FlexFlow diff --git a/src/ops/fused.cu b/src/ops/fused.cu index ca2a33198..b78447ba4 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -21,6 +21,7 @@ #include "flexflow/ops/flat.h" #include "flexflow/ops/fused.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" +#include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/ops/kernels/dropout_kernels.h" @@ -30,7 +31,10 @@ #include "flexflow/ops/kernels/linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/ops/layer_norm.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -213,9 +217,7 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); DropoutMeta *m = (DropoutMeta *)metas->meta[op]; Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + m, my_input_accessor[0], my_output_accessor[0]); break; } case OP_LINEAR: { @@ -294,8 +296,8 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + 
// assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; Kernels::ElementBinary::forward_kernel_wrapper( m, @@ -358,7 +360,8 @@ __host__ void FusedOp::forward_task(Task const *task, my_input_accessor[0].domain.get_volume()); } - assert(my_input_accessor[0].data_type == DT_INT64); + assert(my_input_accessor[0].data_type == DT_INT32 || + my_input_accessor[0].data_type == DT_INT64); Kernels::Embedding::forward_kernel_wrapper(m, my_input_accessor[0], my_output_accessor[0], @@ -368,10 +371,15 @@ __host__ void FusedOp::forward_task(Task const *task, effective_batch_size); break; } + case OP_GELU: case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -414,10 +422,51 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + assert(my_input_accessor[0].data_type == + my_output_accessor[0].data_type); + if (my_input_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int64_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int32_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type"); + } + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + } else { + assert(false); + } + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); break; } case OP_TRANSPOSE: { @@ -427,6 +476,8 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + assert(my_input_accessor[0].data_type == + my_output_accessor[0].data_type); Kernels::Transpose::forward_kernel_wrapper( m, my_input_accessor[0].get_float_ptr(), @@ -435,6 +486,43 @@ __host__ void FusedOp::forward_task(Task const *task, my_output_accessor[0].domain); break; } + case OP_LAYERNORM: { + 
assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + LayerNorm::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_output_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_accessor[0].domain.get_volume()); + } else { + assert(false); + } + + break; + } default: { fprintf(stderr, "Fusion currently does not support type = %d\n", @@ -738,9 +826,7 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); DropoutMeta *m = (DropoutMeta *)metas->meta[op]; Kernels::Dropout::backward_kernel_wrapper( - m, - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr()); + m, my_output_grad_accessor[0], my_input_grad_accessor[0]); break; } case OP_EW_ADD: @@ -752,8 +838,8 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; Kernels::ElementBinary::backward_kernel_wrapper( m, @@ -769,7 +855,8 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; - assert(my_input_accessor[0].data_type == DT_INT64); + assert(my_input_accessor[0].data_type == DT_INT64 || + my_input_accessor[0].data_type == DT_INT32); int in_dim, out_dim, effective_batch_size; if (m->aggr == AGGR_MODE_NONE) { in_dim = 1; @@ -830,10 +917,15 @@ __host__ void FusedOp::backward_task(Task const *task, batch_size); break; } + case OP_GELU: case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -880,10 +972,51 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_grad_accessor[0].domain.get_volume() == my_output_grad_accessor[0].domain.get_volume()); - Kernels::Reshape::backward_kernel_wrapper( - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain.get_volume()); + if (my_input_grad_accessor[0].data_type == 
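
The new OP_CAST branches only dispatch on the (int32 -> int64) and (int32 -> float) pairs that the tokenized-input pipeline produces; underneath, a cast is just an elementwise conversion. A sketch of the kind of kernel the Kernels::Cast wrappers are built on (the name is illustrative, not the actual FlexFlow kernel; CUDA_KERNEL_LOOP is FlexFlow's grid-stride helper):

template <typename IDT, typename ODT>
__global__ void cast_forward_sketch(IDT const *input, ODT *output, size_t volume) {
  CUDA_KERNEL_LOOP(i, volume) {
    output[i] = static_cast<ODT>(input[i]); // elementwise conversion, no layout change
  }
}
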
DT_INT64) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_int64_ptr(), + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].get_int32_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::backward_kernel_wrapper( + m, + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::backward_kernel_wrapper( + m, my_input_grad_accessor[0], my_output_grad_accessor[0]); break; } case OP_TRANSPOSE: { @@ -901,6 +1034,47 @@ __host__ void FusedOp::backward_task(Task const *task, my_output_grad_accessor[0].domain); break; } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + LayerNorm::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + gamma.get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + my_weight_grad_accessor[1].get_float_ptr()); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + + break; + } default: assert(false && "Fusion currently does not support type"); } diff --git a/src/ops/kernels/dropout_kernels.cpp b/src/ops/kernels/dropout_kernels.cpp index b0dd4c644..c0d574846 100644 --- a/src/ops/kernels/dropout_kernels.cpp +++ b/src/ops/kernels/dropout_kernels.cpp @@ -30,6 +30,11 @@ DropoutMeta::DropoutMeta(FFHandler handler, Domain const 
&output_domain) : OpMeta(handler) { profiling = dropout->profiling; + rate = dropout->rate; + seed = dropout->seed; + input_type[0] = dropout->data_type; + output_type[0] = dropout->data_type; + checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreateDropoutDescriptor(&dropoutDesc)); @@ -78,20 +83,68 @@ DropoutMeta::~DropoutMeta(void) { namespace Kernels { namespace Dropout { +__global__ void dropout_forward_kernel(float p, + long long seed, + size_t num_elements, + float const *input_ptr, + float *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float scale = 1.0 / p; + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, i, 0, &state); + float rand = hiprand_uniform(&state); + if (input_ptr[i] < p) { + output_ptr[i] = 0; + } else { + output_ptr[i] = input_ptr[i] * scale; + } + } +} + +__global__ void dropout_backward_kernel(float p, + long long seed, + size_t num_elements, + float const *input_ptr, + float *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float scale = 1.0 / p; + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, i, 0, &state); + float rand = hiprand_uniform(&state); + if (input_ptr[i] < p) { + output_ptr[i] = 0; + } else { + output_ptr[i] = input_ptr[i] * scale; + } + } +} + void forward_kernel_wrapper(DropoutMeta *m, - float const *input_ptr, - float *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + + Internal::forward_kernel(m, + input.get_float_ptr(), + output.get_float_ptr(), + input.domain.get_volume(), + stream); + + // printf("dropout %d\n", input.domain.get_volume()); + // assert(false); } void backward_kernel_wrapper(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::backward_kernel(m, output_grad_ptr, input_grad_ptr, stream); + Internal::backward_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + output_grad.domain.get_volume(), + stream); } namespace Internal { @@ -99,35 +152,58 @@ namespace Internal { void forward_kernel(DropoutMeta *m, float const *input_ptr, float *output_ptr, + size_t num_elements, hipStream_t stream) { checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + int parallelism = num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(dropout_forward_kernel), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->seed, + m->rate, + num_elements, + input_ptr, + output_ptr); - checkCUDNN(miopenDropoutForward(m->handle.dnn, - m->dropoutDesc, - m->inputTensor /* not used */, - m->inputTensor, - input_ptr, - m->outputTensor, - output_ptr, - m->reserveSpace, - m->reserveSpaceSize)); + // checkCUDNN(miopenDropoutForward(m->handle.dnn, + // m->dropoutDesc, + // m->inputTensor /* not used */, + // m->inputTensor, + // input_ptr, + // m->outputTensor, + // output_ptr, + // m->reserveSpace, + // m->reserveSpaceSize)); } void backward_kernel(DropoutMeta *m, float const *output_grad_ptr, float *input_grad_ptr, + size_t num_elements, hipStream_t stream) { checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - - checkCUDNN(miopenDropoutBackward(m->handle.dnn, - m->dropoutDesc, - m->inputTensor /* not used */, - m->outputTensor, - output_grad_ptr, - m->inputTensor, - 
input_grad_ptr, - m->reserveSpace, - m->reserveSpaceSize)); + int parallelism = num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(dropout_backward_kernel), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->seed, + m->rate, + num_elements, + output_grad_ptr, + input_grad_ptr); + // checkCUDNN(miopenDropoutBackward(m->handle.dnn, + // m->dropoutDesc, + // m->inputTensor /* not used */, + // m->outputTensor, + // output_grad_ptr, + // m->inputTensor, + // input_grad_ptr, + // m->reserveSpace, + // m->reserveSpaceSize)); } } // namespace Internal diff --git a/src/ops/kernels/dropout_kernels.cu b/src/ops/kernels/dropout_kernels.cu index 4a76301fd..c5b1a384d 100644 --- a/src/ops/kernels/dropout_kernels.cu +++ b/src/ops/kernels/dropout_kernels.cu @@ -29,6 +29,10 @@ DropoutMeta::DropoutMeta(FFHandler handler, Domain const &output_domain) : OpMeta(handler) { profiling = dropout->profiling; + rate = dropout->rate; + seed = dropout->seed; + input_type[0] = dropout->data_type; + output_type[0] = dropout->data_type; checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreateDropoutDescriptor(&dropoutDesc)); @@ -74,20 +78,97 @@ DropoutMeta::~DropoutMeta(void) { namespace Kernels { namespace Dropout { +__global__ void dropout_forward_kernel(float p, + long long seed, + size_t num_elements, + float const *input_ptr, + float *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float scale = 1.0 / p; + curandStatePhilox4_32_10_t state; + curand_init(seed, i, 0, &state); + float rand = curand_uniform(&state); + if (input_ptr[i] < p) { + output_ptr[i] = 0; + } else { + output_ptr[i] = input_ptr[i] * scale; + } + } +} + +__global__ void dropout_backward_kernel(float p, + long long seed, + size_t num_elements, + float const *input_ptr, + float *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float scale = 1.0 / p; + curandStatePhilox4_32_10_t state; + curand_init(seed, i, 0, &state); + float rand = curand_uniform(&state); + if (input_ptr[i] < p) { + output_ptr[i] = 0; + } else { + output_ptr[i] = input_ptr[i] * scale; + } + } +} + void forward_kernel_wrapper(DropoutMeta *m, - float const *input_ptr, - float *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + Internal::forward_kernel(m, + input.get_float_ptr(), + output.get_float_ptr(), + input.domain.get_volume(), + stream); + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf(" [dropout] forward time = %.2lfms\n", elapsed); + } } void backward_kernel_wrapper(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::backward_kernel(m, output_grad_ptr, input_grad_ptr, stream); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + Internal::backward_kernel(m, + 
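
For reference, the textbook inverted-dropout formulation that a Philox-based replacement for the cuDNN/MIOpen dropout calls computes is sketched below: the keep/drop decision comes from the per-element uniform draw rather than from the input value, elements are kept with probability 1 - p, kept elements are rescaled by 1 / (1 - p) so the expected activation is unchanged, and the backward pass regenerates the same mask from the same seed and counter offset. This is a sketch of the standard technique rather than the patch's kernel verbatim, and the launch site has to pass the rate and the seed in the order the kernel declares them:

#include <curand_kernel.h>

__global__ void inverted_dropout_sketch(float p, // drop probability (rate)
                                        unsigned long long seed,
                                        size_t num_elements,
                                        float const *input,
                                        float *output) {
  float const scale = 1.0f / (1.0f - p);
  CUDA_KERNEL_LOOP(i, num_elements) {
    curandStatePhilox4_32_10_t state;
    curand_init(seed, /*subsequence=*/i, /*offset=*/0, &state);
    float r = curand_uniform(&state); // uniform draw in (0, 1]
    output[i] = (r < p) ? 0.0f : input[i] * scale;
  }
}
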
output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + output_grad.domain.get_volume(), + stream); + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf(" [dropout] backward time = %.2lfms\n", elapsed); + } } namespace Internal { @@ -95,33 +176,48 @@ namespace Internal { void forward_kernel(DropoutMeta *m, float const *input_ptr, float *output_ptr, + size_t num_elements, cudaStream_t stream) { checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - checkCUDNN(cudnnDropoutForward(m->handle.dnn, - m->dropoutDesc, - m->inputTensor, - input_ptr, - m->outputTensor, - output_ptr, - m->reserveSpace, - m->reserveSpaceSize)); + int parallelism = num_elements; + dropout_forward_kernel<<>>( + m->seed, m->rate, num_elements, input_ptr, output_ptr); + + // checkCUDNN(cudnnDropoutForward(m->handle.dnn, + // m->dropoutDesc, + // m->inputTensor, + // input_ptr, + // m->outputTensor, + // output_ptr, + // m->reserveSpace, + // m->reserveSpaceSize)); } void backward_kernel(DropoutMeta *m, float const *output_grad_ptr, float *input_grad_ptr, + size_t num_elements, cudaStream_t stream) { checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + int parallelism = num_elements; + dropout_backward_kernel<<>>( + m->seed, m->rate, num_elements, output_grad_ptr, input_grad_ptr); - checkCUDNN(cudnnDropoutBackward(m->handle.dnn, - m->dropoutDesc, - m->outputTensor, - output_grad_ptr, - m->inputTensor, - input_grad_ptr, - m->reserveSpace, - m->reserveSpaceSize)); + // checkCUDNN(cudnnDropoutBackward(m->handle.dnn, + // m->dropoutDesc, + // m->outputTensor, + // output_grad_ptr, + // m->inputTensor, + // input_grad_ptr, + // m->reserveSpace, + // m->reserveSpaceSize)); } } // namespace Internal diff --git a/src/ops/kernels/element_binary_kernels.cpp b/src/ops/kernels/element_binary_kernels.cpp index 4cdc839b5..325edba6d 100644 --- a/src/ops/kernels/element_binary_kernels.cpp +++ b/src/ops/kernels/element_binary_kernels.cpp @@ -72,15 +72,12 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, float *out_ptr) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - hipEvent_t t_start, t_end; if (m->profiling) { hipEventCreate(&t_start); hipEventCreate(&t_end); hipEventRecord(t_start, stream); } - // print_tensor(in1_ptr, in1_domain.get_volume(), "input1:"); - // print_tensor(in2_ptr, in2_domain.get_volume(), "input2:"); Internal::forward_kernel(m, in1_ptr, in2_ptr, out_ptr, stream); // print_tensor(out_ptr, in1_domain.get_volume(), "output:"); if (m->profiling) { @@ -199,6 +196,21 @@ __global__ void elewise_binary_forward_kernel(coord_t volume, } } +// for simplicity, assume the replicate dimension is the batchsize +__global__ void + elewise_binary_forward_kernel_broadcast2(float const *in1_ptr, + float const *in2_ptr, + float *output_ptr, + size_t volume, + size_t batch_size, + size_t replicate_size) { + CUDA_KERNEL_LOOP(i, volume) { + size_t batch = i / replicate_size; + output_ptr[i] = + in1_ptr[i] + in2_ptr[batch * replicate_size + i % replicate_size]; + } +} + __global__ void elewise_binary_backward_kernel(coord_t volume, float const alpha, float const beta, @@ -245,7 +257,6 @@ void forward_kernel(ElementBinaryMeta const *m, hipStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - float alpha1 = 1.0f, alpha2 = 1.0f, beta = 0.0f; switch (m->op_type) 
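
elewise_binary_forward_kernel_broadcast2 is meant to add a smaller second operand onto every batch of the first one. A minimal sketch of that broadcast pattern, assuming in2 holds replicate_size elements that are shared by all batches, so its index depends only on the position within a batch:

__global__ void broadcast_add_sketch(float const *in1, // batch_size * replicate_size elements
                                     float const *in2, // replicate_size elements, reused per batch
                                     float *out,
                                     size_t batch_size,
                                     size_t replicate_size) {
  CUDA_KERNEL_LOOP(i, batch_size * replicate_size) {
    out[i] = in1[i] + in2[i % replicate_size];
  }
}
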
{ case OP_EW_SUB: @@ -284,6 +295,19 @@ void forward_kernel(ElementBinaryMeta const *m, &alpha1, m->outputTensor, out_ptr)); + } else if (m->op_type == OP_EW_ADD && m->broadcast_input2) { + int parallelism = m->batch_size * m->replicate_size; + hipLaunchKernelGGL(elewise_binary_forward_kernel_broadcast2, + GET_BLOCKS(parallelism), + CUDA_NUM_THREADS, + 0, + stream, + in1_ptr, + in2_ptr, + out_ptr, + m->batch_size * m->replicate_size, + m->batch_size, + m->replicate_size); } else { checkCUDNN(miopenOpTensor(m->handle.dnn, m->opDesc, diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index d63bd0edc..6df6351bb 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -27,8 +27,11 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, Domain const &input_domain) : OpMeta(handler) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); - checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain)); + // checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain)); + checkCUDNN( + cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain)); dim = softmax->dim; + last_layer = softmax->last_layer; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); } @@ -67,6 +70,7 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -78,7 +82,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, hipEventRecord(t_start, stream); } Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + m, input_grad_ptr, output_grad_ptr, output_ptr, num_elements, stream); if (m->profiling) { hipEventRecord(t_end, stream); checkCUDA(hipEventSynchronize(t_end)); @@ -114,15 +118,32 @@ void forward_kernel(SoftmaxMeta const *m, MIOPEN_SOFTMAX_MODE_CHANNEL)); } -void backward_kernel(float *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements, hipStream_t stream) { - checkCUDA(hipMemcpyAsync(input_grad_ptr, - output_grad_ptr, - num_elements * sizeof(float), - hipMemcpyDeviceToDevice, - stream)); + if (m->last_layer) { + checkCUDA(hipMemcpyAsync(input_grad_ptr, + output_grad_ptr, + num_elements * sizeof(float), + hipMemcpyDeviceToDevice, + stream)); + } else { + float alpha = 1.0f, beta = 0.0f; + checkCUDNN(miopenSoftmaxBackward_V2(m->handle.dnn, + &alpha, + m->inputTensor, + output_ptr, + m->inputTensor, + output_grad_ptr, + &beta, + m->inputTensor, + input_grad_ptr, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + } } } // namespace Internal diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index d83d9952c..e163c9a0c 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -26,8 +26,10 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, Domain const &input_domain) : OpMeta(handler) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); - checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + inputTensor, input_domain, softmax->data_type)); dim = softmax->dim; + last_layer = softmax->last_layer; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); } @@ -66,6 +68,7 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta 
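// When `last_layer` is set, the incoming gradient is copied straight through,
// presumably because the loss kernel already produced the fused
// softmax-plus-cross-entropy gradient (softmax output minus label); otherwise
// the full softmax Jacobian-vector product is applied via
// cudnnSoftmaxBackward / miopenSoftmaxBackward_V2. A host-side reference of
// that formula for one row of length n (illustrative sketch only):
#include <cstddef>
void softmax_backward_ref(float const *y,  // softmax outputs
                          float const *dy, // upstream gradient dL/dy
                          float *dx,       // resulting gradient dL/dx
                          std::size_t n) {
  float dot = 0.0f;
  for (std::size_t j = 0; j < n; j++) {
    dot += y[j] * dy[j];
  }
  for (std::size_t i = 0; i < n; i++) {
    // dL/dx_i = y_i * (dL/dy_i - sum_j y_j * dL/dy_j)
    dx[i] = y[i] * (dy[i] - dot);
  }
}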
const *m, float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -77,7 +80,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, cudaEventRecord(t_start, stream); } Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + m, input_grad_ptr, output_grad_ptr, output_ptr, num_elements, stream); if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -113,15 +116,33 @@ void forward_kernel(SoftmaxMeta const *m, output_ptr)); } -void backward_kernel(float *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements, cudaStream_t stream) { - checkCUDA(cudaMemcpyAsync(input_grad_ptr, - output_grad_ptr, - num_elements * sizeof(float), - cudaMemcpyDeviceToDevice, - stream)); + + if (m->last_layer) { + checkCUDA(cudaMemcpyAsync(input_grad_ptr, + output_grad_ptr, + num_elements * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); + } else { + float alpha = 1.0f, beta = 0.0f; + checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->inputTensor, + output_ptr, + m->inputTensor, + output_grad_ptr, + &beta, + m->inputTensor, + input_grad_ptr)); + } } } // namespace Internal diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index a6928e7d1..ccbd7c2dd 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -61,10 +61,27 @@ Tensor FFModel::layer_norm(const Tensor input, std::vector const &axes, bool elementwise_affine, float eps, + DataType data_type, char const *name) { - // FIXME: currently disable elementwise_affine - elementwise_affine = false; - // axes must be the last axes.size() dimensions + // In PyTorch, axes must be the sizes of the last axes.size() dimensions of + // the input tensor. However, since the tensor dimensions are reversed in + // FlexFlow (batch size is the last dimension), we require that axes must be + // the sizes of the FIRST axes.size() dimensions of the input tensor. + + // Another difference is that in PyTorch, the axes vector should contain the + // sizes of the dimensions with respect to which you want to compute the + // layernorm. In FlexFlow, instead, axes should contain the INDICES of the + // dimensions in question. We do this because the size of a dimension might be + // different when splitting a tensor in model parallelism. + assert( + axes.size() <= input->num_dims && + "number of axes must be less than tensor dimensions"); // input does not + // have replica + // dimension here + for (int i = 0; i < axes.size(); i++) { + assert(axes[i] == i && "axes must be the first axes.size() dimensions"); + } +#ifdef DEADCODE for (int i = 0; i < axes.size(); i++) { bool found = false; for (int j = 0; j < axes.size(); j++) { @@ -76,15 +93,33 @@ Tensor FFModel::layer_norm(const Tensor input, assert(false && "axes must be the last axes.size() dimensions"); } } +#endif + if (data_type == DT_NONE) { + data_type = input->data_type; + } int num_weights = elementwise_affine ? 
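// Usage sketch for the axes convention documented above: in FlexFlow, `axes`
// holds the INDICES of the leading (innermost, since dimensions are reversed)
// tensor dimensions to normalize over, whereas PyTorch takes the SIZES of the
// trailing dimensions. Assuming an FFModel `ff` and a Tensor `x` whose
// FlexFlow dimension 0 is a 768-wide hidden dimension, and default values for
// the remaining parameters:
//   std::vector<int> axes = {0};
//   Tensor y = ff.layer_norm(x, axes, true /*elementwise_affine*/, 1e-5f);
// The PyTorch counterpart would be torch.nn.LayerNorm(768).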
2 : 0; - Layer *ln = new Layer(this, - OP_LAYERNORM, - DT_FLOAT, - name, - 1 /*inputs*/, - num_weights, - 1 /*outputs*/, - input); + Layer *ln = nullptr; + if (data_type != input->data_type) { + Tensor casted_input = cast(input, data_type, "type cast for layer_norm"); + ln = new Layer(this, + OP_LAYERNORM, + data_type, + name, + 1 /*inputs*/, + num_weights, + 1 /*outputs*/, + casted_input); + } else { + ln = new Layer(this, + OP_LAYERNORM, + data_type, + name, + 1 /*inputs*/, + num_weights, + 1 /*outputs*/, + input); + } + ln->outputs[0] = create_tensor_legion_ordering(input->num_dims, input->dims, input->data_type, @@ -92,19 +127,19 @@ Tensor FFModel::layer_norm(const Tensor input, 0, true /*create_grad*/); if (num_weights == 2) { - int M = 1; - for (int i = 0; i < axes.size(); i++) { - M *= input->dims[input->num_dims - 1 - axes[i]]; + int numdims = axes.size(); + int dims[numdims]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[axes[i]]; } - int dims[1] = {M}; - ln->weights[0] = create_weight_legion_ordering(1, + ln->weights[0] = create_weight_legion_ordering(numdims, dims, input->data_type, ln, true /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); - ln->weights[1] = create_weight_legion_ordering(1, + ln->weights[1] = create_weight_legion_ordering(numdims, dims, input->data_type, ln, @@ -179,19 +214,45 @@ LayerNorm::LayerNorm(FFModel &model, ParallelDim output_dims[MAX_TENSOR_DIM]; int M = 1; for (int i = 0; i < axes.size(); i++) { - M *= inputs[0]->dims[inputs[0]->num_dims - 1 - axes[i]].size; + M *= inputs[0]->dims[axes[i]].size; } effective_num_elements = M; effective_batch_size = inputs[0]->get_volume() / M; + assert(elementwise_affine == (numWeights == 2)); if (numWeights > 0 && allocate_weights) { - int kernel_dims = 2; - assert(false); - // weights[0] = model.create_parallel_weight_legion_ordering( - // kernel_dims, - } else { - // do nothing + ParallelDim dims[axes.size() + 1]; + int num_dims = axes.size(); + for (int i = 0; i < num_dims; i++) { + dims[i] = inputs[0]->dims[i]; + } + assert(numInputs == 1); + dims[num_dims].degree = inputs[0]->dims[inputs[0]->num_dims - 1].degree; + dims[num_dims].size = dims[num_dims].degree; + dims[num_dims].parallel_idx = + inputs[0]->dims[inputs[0]->num_dims - 1].parallel_idx; + dims[num_dims].is_replica_dim = true; + num_dims += 1; + + int seed = std::rand(); + Initializer *gamma_initializer = new UniformInitializer(seed, 0.0f, 1.0f); + Initializer *beta_initializer = new UniformInitializer(seed, 0.0f, 1.0f); + weights[0] = + model.create_parallel_weight_legion_ordering(num_dims, + dims, + _input->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + gamma_initializer, + CHOSEN_SYNC_TYPE); + weights[1] = + model.create_parallel_weight_legion_ordering(num_dims, + dims, + _input->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + beta_initializer, + CHOSEN_SYNC_TYPE); } - return; } void LayerNorm::init(FFModel const &ff) { @@ -221,6 +282,20 @@ void LayerNorm::init(FFModel const &ff) { EXCLUSIVE, inputs[0]->region)); launcher.add_field(1, FID_DATA); + if (elementwise_affine) { + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); 
set_opmeta_from_futuremap(ff, fm); @@ -233,6 +308,8 @@ OpMeta *LayerNorm::init_task(Task const *task, LayerNorm *ln = (LayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); LayerNormMeta *meta = new LayerNormMeta(handle, ln); + meta->input_type[0] = ln->inputs[0]->data_type; + meta->output_type[0] = ln->outputs[0]->data_type; return meta; } @@ -292,35 +369,51 @@ void LayerNorm::forward_task(Task const *task, assert(task->regions.size() == regions.size()); float const *in_ptr = NULL; float *out_ptr = NULL, *gamma_ptr = NULL, *beta_ptr = NULL; + GenericTensorAccessorR in; + GenericTensorAccessorW out, gamma, beta; + Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); + // in_ptr = helperGetTensorPointerRO( + // regions[0], task->regions[0], FID_DATA, ctx, runtime); + in = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); + // out_ptr = helperGetTensorPointerWO( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + out = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(in_domain == out_domain); - assert(in_domain.get_volume() == - m->effective_num_elements * m->effective_batch_size); + // assert(in_domain.get_volume() == + // m->effective_num_elements * m->effective_batch_size); + if (m->elementwise_affine) { assert(regions.size() == 4); Domain gamma_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - gamma_ptr = helperGetTensorPointerRW( - regions[2], task->regions[2], FID_DATA, ctx, runtime); + // gamma_ptr = helperGetTensorPointerRW( + // regions[2], task->regions[2], FID_DATA, ctx, runtime); + gamma = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); Domain beta_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - beta_ptr = helperGetTensorPointerRW( - regions[3], task->regions[3], FID_DATA, ctx, runtime); + // beta_ptr = helperGetTensorPointerRW( + // regions[3], task->regions[3], FID_DATA, ctx, runtime); + beta = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); assert(gamma_domain == beta_domain); assert(gamma_domain.get_volume() == m->effective_num_elements); + int numdims = gamma_domain.get_dim() - 1; + for (int i = 0; i < numdims; i++) { + int g_d = gamma_domain.hi()[i] - gamma_domain.lo()[i] + 1; + int in_d = in_domain.hi()[i] - in_domain.lo()[i] + 1; + assert(g_d == in_d); + } } else { assert(regions.size() == 2); } - - LayerNorm::forward_kernel_wrapper( - m, in_ptr, out_ptr, gamma_ptr, beta_ptr); + LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); } void LayerNorm::backward(FFModel const &ff) { @@ -412,8 +505,8 @@ void LayerNorm::backward_task(Task const *task, in_grad_ptr = helperGetTensorPointerRW( regions[2], task->regions[2], FID_DATA, ctx, runtime); assert(in_domain == out_grad_domain); - assert(in_domain.get_volume() == - m->effective_num_elements * m->effective_batch_size); + // assert(in_domain.get_volume() == + // m->effective_num_elements * m->effective_batch_size); if 
(m->elementwise_affine) { assert(regions.size() == 6); Domain gamma_domain = runtime->get_index_space_domain( @@ -454,19 +547,26 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, if (!inputs[0]->get_sub_tensor(mv, sub_input)) { return false; } + Domain input_domain = sub_input.get_domain(); + Domain output_domain = sub_output.get_domain(); LayerNormMeta *m = new LayerNormMeta(sim->handler, this); sim->free_all(); float *in_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); assert(in_ptr != NULL); + GenericTensorAccessorR input1_acc(inputs[0]->data_type, input_domain, in_ptr); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *out_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(out_ptr != NULL); + GenericTensorAccessorW output_acc( + outputs[0]->data_type, output_domain, out_ptr); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); // FIXME please add gamma_ptr and beta_ptr after finish the implementation float *gamma_ptr = NULL, *beta_ptr = NULL; + GenericTensorAccessorR gamma_acc; + GenericTensorAccessorR beta_acc; bool out_of_memory = (in_ptr == NULL) || (out_ptr == NULL) || @@ -479,7 +579,7 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, std::function forward, backward; forward = [&] { - forward_kernel_wrapper(m, in_ptr, out_ptr, gamma_ptr, beta_ptr); + forward_kernel_wrapper(m, input1_acc, output_acc, gamma_acc, beta_acc); }; if (sim->computationMode == COMP_MODE_TRAINING) { diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index c3030e20b..8ea2ebba9 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/layer_norm.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/utils/hip_helper.h" #include @@ -30,12 +31,26 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln) effective_batch_size = ln->effective_batch_size; effective_num_elements = ln->effective_num_elements; eps = ln->eps; - checkCUDA(hipMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&db_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&db_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); + + DataType data_type = ln->data_type; + checkCUDA( + hipMalloc(&mean_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&rstd_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&ds_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&db_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&scale_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&bias_ptr, data_type_size(data_type) * effective_batch_size)); } template @@ -43,12 +58,10 @@ __device__ __forceinline__ T WARP_SHFL_DOWN(T value, 
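// The WARP_SHFL_DOWN helper below feeds BlockReduceSum: each lane repeatedly
// adds the value held `delta` lanes higher until lane 0 owns the warp-wide
// sum, and one value per warp is then combined through shared memory. A
// minimal warp-level sketch of the same idea (CUDA flavor; the HIP branch
// uses __shfl_down without a mask):
template <typename T>
__device__ __forceinline__ T warp_reduce_sum_sketch(T val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    val += __shfl_down_sync(0xffffffffu, val, offset);
  }
  return val; // valid in lane 0 of each warp
}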
unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) { -#if 0 #ifndef __HIP_PLATFORM_HCC__ - return __shfl_down_sync(mask, value, delta, width); + return __shfl_down_sync(mask, value, delta, width); #else - return __shfl_down(value, delta, width); -#endif + return __shfl_down(value, delta, width); #endif } @@ -79,26 +92,26 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { } template -__global__ void - RowwiseMomentsCUDAKernel(int64_t N, T eps, T const *X, T *mean, T *rstd) { - __shared__ T m_shared[C10_WARP_SIZE]; - __shared__ T v_shared[C10_WARP_SIZE]; +__global__ void RowwiseMomentsCUDAKernel( + int64_t N, float eps, T const *X, T *mean, T *rstd) { + __shared__ float m_shared[C10_WARP_SIZE]; + __shared__ float v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - T sum1 = 0; - T sum2 = 0; + float sum1 = 0.0f; + float sum2 = 0.0f; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - sum1 += static_cast(X[index]); - sum2 += static_cast(X[index]) * static_cast(X[index]); + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); } - sum1 = BlockReduceSum(sum1, m_shared); - sum2 = BlockReduceSum(sum2, v_shared); + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); if (threadIdx.x == 0) { - const T scale = T(1) / static_cast(N); + float const scale = float(1) / static_cast(N); sum1 *= scale; - sum2 = max(sum2 * scale - sum1 * sum1, T(0)); - mean[i] = sum1; - rstd[i] = rsqrt(sum2 + static_cast(eps)); + sum2 = max(sum2 * scale - sum1 * sum1, float(0)); + mean[i] = static_cast(sum1); + rstd[i] = static_cast(rsqrt(sum2 + eps)); } } @@ -129,10 +142,10 @@ template void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *in_ptr, T *out_ptr, - T *gamma_ptr, - T *beta_ptr, + T const *gamma_ptr, + T const *beta_ptr, hipStream_t stream) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(RowwiseMomentsCUDAKernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(RowwiseMomentsCUDAKernel), m->effective_batch_size, kCUDABlockReduceNumThreads, 0, @@ -140,33 +153,50 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, m->effective_num_elements, m->eps, in_ptr, - m->mean_ptr, - m->rstd_ptr); - hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormForwardCUDAKernel), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr)); + hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormForwardCUDAKernel), m->effective_batch_size, kCUDANumThreads, 0, stream, m->effective_num_elements, in_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_ptr, beta_ptr, out_ptr); } /*static*/ -template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - T const *in_ptr, - T *out_ptr, - T *gamma_ptr, - T *beta_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - LayerNorm::forward_kernel( - m, in_ptr, out_ptr, gamma_ptr, beta_ptr, stream); + // LayerNorm::forward_kernel( + // m, in_ptr, out_ptr, gamma_ptr, beta_ptr, stream); + + if (m->input_type[0] == DT_FLOAT) { + LayerNorm::forward_kernel(m, + input.get_float_ptr(), + output.get_float_ptr(), + gamma.get_float_ptr(), + beta.get_float_ptr(), + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel(m, + input.get_half_ptr(), + output.get_half_ptr(), + gamma.get_half_ptr(), + beta.get_half_ptr(), + stream); + } else { + assert(false && 
"unsupport datatype in layernorm"); + } } template @@ -346,6 +376,82 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, } } +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + dX_i[l] = f_grad_input; + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, X, mean, rstd, gamma, dX, N, buf); +} + /*static*/ template void LayerNorm::backward_kernel(LayerNormMeta const *m, @@ -367,8 +473,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, output_grad_ptr, input_ptr, gamma_ptr, - m->ds_ptr, - m->db_ptr); + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), B, @@ -377,12 +483,29 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, stream, M, N, - m->mean_ptr, - m->rstd_ptr, - m->ds_ptr, - m->db_ptr, - m->scale_ptr, - m->bias_ptr); + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if 
(M < 512) { // For small batch size, do colwise reduce directly @@ -396,8 +519,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, N, output_grad_ptr, input_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_grad_ptr, beta_grad_ptr); } else { @@ -414,8 +537,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, N, output_grad_ptr, input_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_grad_ptr, beta_grad_ptr); } @@ -443,11 +566,12 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, stream); } -template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - float const *in_ptr, - float *out_ptr, - float *gamma_ptr, - float *beta_ptr); +// template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const +// *m, +// float const *in_ptr, +// float *out_ptr, +// float *gamma_ptr, +// float *beta_ptr); template void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, float const *output_grad_ptr, diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index ac477ba2a..736d12251 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -13,6 +13,7 @@ * limitations under the License. */ +#include "flexflow/ffconst_utils.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/utils/cuda_helper.h" @@ -30,12 +31,19 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln) effective_num_elements = ln->effective_num_elements; profiling = ln->profiling; eps = ln->eps; - checkCUDA(cudaMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&db_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); + DataType data_type = ln->data_type; + checkCUDA( + cudaMalloc(&mean_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&rstd_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&ds_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&db_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&scale_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&bias_ptr, data_type_size(data_type) * effective_batch_size)); } template @@ -77,26 +85,26 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { } template -__global__ void - RowwiseMomentsCUDAKernel(int64_t N, T eps, T const *X, T *mean, T *rstd) { - __shared__ T m_shared[C10_WARP_SIZE]; - __shared__ T v_shared[C10_WARP_SIZE]; +__global__ void RowwiseMomentsCUDAKernel( + int64_t N, float eps, T const *X, T *mean, T *rstd) { + __shared__ float m_shared[C10_WARP_SIZE]; + __shared__ float v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - T sum1 = 0; - T sum2 = 0; + float sum1 = 0.0f; + float sum2 = 0.0f; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - sum1 += static_cast(X[index]); - sum2 += static_cast(X[index]) * static_cast(X[index]); + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); } - sum1 = BlockReduceSum(sum1, m_shared); - sum2 = BlockReduceSum(sum2, v_shared); + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); 
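// Accumulation note: sum1 and sum2 are now carried in float even when T is
// half, so the row statistics do not lose precision; the cast back to T only
// happens when mean/rstd are written. For a row of length N the branch below
// produces
//   mean = (1/N) * sum_i x_i
//   rstd = 1 / sqrt(max((1/N) * sum_i x_i^2 - mean^2, 0) + eps)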
if (threadIdx.x == 0) { - const T scale = T(1) / static_cast(N); + float const scale = float(1) / static_cast(N); sum1 *= scale; - sum2 = max(sum2 * scale - sum1 * sum1, T(0)); - mean[i] = sum1; - rstd[i] = rsqrt(sum2 + static_cast(eps)); + sum2 = max(sum2 * scale - sum1 * sum1, float(0)); + mean[i] = static_cast(sum1); + rstd[i] = static_cast(rsqrt(sum2 + eps)); } } @@ -127,30 +135,33 @@ template void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *in_ptr, T *out_ptr, - T *gamma_ptr, - T *beta_ptr, + T const *gamma_ptr, + T const *beta_ptr, cudaStream_t stream) { - RowwiseMomentsCUDAKernel + RowwiseMomentsCUDAKernel <<effective_batch_size, kCUDABlockReduceNumThreads, 0, stream>>>( - m->effective_num_elements, m->eps, in_ptr, m->mean_ptr, m->rstd_ptr); - LayerNormForwardCUDAKernel + m->effective_num_elements, + m->eps, + in_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr)); + LayerNormForwardCUDAKernel <<effective_batch_size, kCUDANumThreads, 0, stream>>>( m->effective_num_elements, in_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_ptr, beta_ptr, out_ptr); } /*static*/ -template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - T const *in_ptr, - T *out_ptr, - T *gamma_ptr, - T *beta_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -160,8 +171,24 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - LayerNorm::forward_kernel( - m, in_ptr, out_ptr, gamma_ptr, beta_ptr, stream); + if (m->input_type[0] == DT_FLOAT) { + LayerNorm::forward_kernel(m, + input.get_float_ptr(), + output.get_float_ptr(), + gamma.get_float_ptr(), + beta.get_float_ptr(), + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel(m, + input.get_half_ptr(), + output.get_half_ptr(), + gamma.get_half_ptr(), + beta.get_half_ptr(), + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -170,8 +197,8 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, cudaEventDestroy(t_start); cudaEventDestroy(t_end); printf("[LayerNorm] forward time (CF) = %.2fms\n", elapsed); - print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); - print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); } } @@ -352,6 +379,82 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, } } +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + dX_i[l] = f_grad_input; + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, X, mean, rstd, gamma, dX, N, buf); +} + /*static*/ template void LayerNorm::backward_kernel(LayerNormMeta const *m, @@ -366,17 +469,34 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, const int64_t N = m->effective_num_elements; ComputeInternalGradientsCUDAKernel <<>>( - N, output_grad_ptr, input_ptr, gamma_ptr, m->ds_ptr, m->db_ptr); + N, + output_grad_ptr, + input_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; ComputeGradientFusedParamsCUDAKernel <<>>(M, N, - m->mean_ptr, - m->rstd_ptr, - m->ds_ptr, - m->db_ptr, - m->scale_ptr, - m->bias_ptr); + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly @@ -386,8 +506,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, N, output_grad_ptr, input_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_grad_ptr, beta_grad_ptr); } else { @@ -396,14 +516,15 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, constexpr int kThreadX = kColwiseReduceTileSize; constexpr int kThreadY = kColwiseReduceTileSize / 2; GammaBetaBackwardCUDAKernel - <<>>(M, - N, - output_grad_ptr, - input_ptr, - m->mean_ptr, - m->rstd_ptr, - gamma_grad_ptr, - beta_grad_ptr); + <<>>( + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); } } } @@ -419,21 +540,18 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, T *beta_grad_ptr) { cudaStream_t stream; 
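// layer_norm_grad_input_kernel (compute_gI above) evaluates, per row of
// length N with x_hat_i = (x_i - mean) * rstd:
//   dx_i = (rstd / N) * (N * gamma_i * dy_i - s1 - x_hat_i * s2)
//   where s1 = sum_j gamma_j * dy_j and s2 = sum_j gamma_j * dy_j * x_hat_j
// A host-side reference sketch of the same computation (illustrative only):
#include <cstddef>
void layer_norm_grad_input_ref(float const *dy, float const *x, float mean,
                               float rstd, float const *gamma, float *dx,
                               std::size_t N) {
  float fN = (float)N;
  float s1 = 0.0f, s2 = 0.0f;
  for (std::size_t j = 0; j < N; j++) {
    float g = gamma ? gamma[j] : 1.0f;
    s1 += g * dy[j];
    s2 += g * dy[j] * (x[j] - mean) * rstd;
  }
  for (std::size_t i = 0; i < N; i++) {
    float g = gamma ? gamma[i] : 1.0f;
    dx[i] = (rstd / fN) * (fN * g * dy[i] - s1 - (x[i] - mean) * rstd * s2);
  }
}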
checkCUDA(get_legion_stream(&stream)); - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel(m, + output_grad_ptr, + input_ptr, + input_grad_ptr, + gamma_ptr, + gamma_grad_ptr, + beta_grad_ptr, + stream); + } } -template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - float const *in_ptr, - float *out_ptr, - float *gamma_ptr, - float *beta_ptr); template void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, float const *output_grad_ptr, @@ -443,4 +561,4 @@ template void float *gamma_grad_ptr, float *beta_grad_ptr); -}; // namespace FlexFlow +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 75c51b41b..20cc19310 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -190,6 +190,23 @@ Linear::Linear(FFModel &model, params.construct_mappings(*this->parallel_dims_mapping, input_shape); params.solve_dims(input_shape, output_shape, kernel_shape, bias_shape); + kernel_shape.dims[0].size = this->in_channels; + bias_shape.dims[0].degree = _input->dims[_input->num_dims - 1].degree; + bias_shape.dims[0].parallel_idx = + _input->dims[_input->num_dims - 1].parallel_idx; + bias_shape.dims[1].size = bias_shape.dims[1].degree = 1; + bias_shape.dims[1].parallel_idx = -1; + bias_shape.dims[bias_shape.num_dims - 1].size = + bias_shape.dims[bias_shape.num_dims - 1].degree = 1; + for (int i = 0; i < input_shape.num_dims - 1; i++) { + if (_input->dims[i].degree > 1) { + bias_shape.dims[bias_shape.num_dims - 1].size *= _input->dims[i].degree; + bias_shape.dims[bias_shape.num_dims - 1].degree *= _input->dims[i].degree; + bias_shape.dims[bias_shape.num_dims - 1].parallel_idx = + _input->dims[i].parallel_idx; + } + } + if (allocate_weights) { Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); @@ -220,7 +237,6 @@ Linear::Linear(FFModel &model, outputs[0] = model.create_parallel_tensor_legion_ordering( output_shape.num_dims, output_shape.dims, _data_type, this); - assert(check_output_input_weight_parallel_dims(allocate_weights)); } void Linear::init(FFModel const &ff) { @@ -433,11 +449,11 @@ void Linear::forward_task_with_dim(Task const *task, int out_dim = acc_output.rect.hi[0] - acc_output.rect.lo[0] + 1; int batch_size = acc_output.rect.volume() / out_dim; assert(acc_output.rect.volume() == static_cast(out_dim * batch_size)); - assert(acc_input.rect.volume() == static_cast(in_dim * batch_size)); + // assert(acc_input.rect.volume() == static_cast(in_dim * batch_size)); assert(acc_kernel.rect.volume() == static_cast(in_dim * out_dim)); float const *acc_bias_ptr = NULL; if (m->use_bias) { - TensorAccessorR acc_bias( + TensorAccessorR acc_bias( regions[3], task->regions[3], FID_DATA, ctx, runtime); assert(acc_bias.rect.volume() == static_cast(out_dim)); acc_bias_ptr = acc_bias.ptr; @@ -623,12 +639,12 @@ void Linear::backward_task_with_dim(Task const *task, static_cast(in_dim * out_dim)); float *acc_bias_grad_ptr = NULL; if (m->use_bias) { - TensorAccessorW acc_bias_grad(regions[rid], - task->regions[rid], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); + TensorAccessorW acc_bias_grad(regions[rid], + task->regions[rid], + FID_DATA, + ctx, + runtime, + true /*readOutput*/); rid++; assert(acc_bias_grad.rect.volume() == static_cast(out_dim)); acc_bias_grad_ptr = static_cast(acc_bias_grad.ptr); diff --git a/src/ops/reshape.cc b/src/ops/reshape.cc index 
2b8a60bf2..07797bd22 100644 --- a/src/ops/reshape.cc +++ b/src/ops/reshape.cc @@ -80,9 +80,14 @@ Op *Reshape::create_operator_from_layer( return new Reshape(model, layer->layer_guid, inputs[0], shape, layer->name); } +bool match_pattern(std::vector const &_shape) { + return (_shape.size() == 4 && _shape[1] == 1 && _shape[2] == 1 && + _shape[3] == 512); +} + Reshape::Reshape(FFModel &model, LayerID const &_layer_guid, - const ParallelTensor input, + ParallelTensor const input, std::vector const &_shape, char const *name) : Op(model, @@ -106,19 +111,64 @@ Reshape::Reshape(FFModel &model, if (input->dims[i].is_replica_dim) { num_replica_dims++; } + // std::cout << "reshape input size: " << input->dims[i].size + // << ", parallelidx: " << input->dims[i].parallel_idx << ". degree: " << input->dims[i].degree + // << "is replicate dim: " << input->dims[i].is_replica_dim << + // "\n"; } + + // assert(false); // assert that all replica dims are leading dims for (int i = 0; i < num_replica_dims; i++) { assert(input->dims[input->num_dims - 1 - i].is_replica_dim); } int numdim = (int)_shape.size(); ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdim; i++) { - dims[i].size = _shape[numdim - 1 - i]; - dims[i].degree = 1; - dims[i].parallel_idx = -1; - dims[i].is_replica_dim = false; - } + + + bool expanded = numdim >= input->num_dims; + bool aggregation = numdim < input->num_dims - 1; + + for (int i = 0; i < numdim; i++) { + if (expanded && i < numdim - 1 && + _shape[i] * _shape[i + 1] == input->dims[numdim - i - 2].size) { + dims[numdim - i - 1].size = _shape[i]; + dims[numdim - i - 1].degree = input->dims[numdim - i - 2].degree; + dims[numdim - i - 1].parallel_idx = + input->dims[numdim - i - 2].parallel_idx; + dims[numdim - i - 1].is_replica_dim = + input->dims[numdim - i - 2].is_replica_dim; + std::cout << "expand dim i:" << i << ", " << dims[numdim - i - 1].degree + << ", " << dims[numdim - i - 1].size << "\n"; + } else if (aggregation && + (_shape[i] == input->dims[input->num_dims - 2 - i].size * + input->dims[input->num_dims - 3 - i].size)) { + // inherit + dims[numdim - i - 1].size = _shape[i]; + dims[numdim - i - 1].degree = + input->dims[input->num_dims - 2 - i].degree; + dims[numdim - i - 1].parallel_idx = + input->dims[input->num_dims - 2 - i].parallel_idx; + dims[numdim - i - 1].is_replica_dim = + input->dims[input->num_dims - 2 - i].is_replica_dim; + // std::cout << "agree i: " << i <<", " << _shape[i] << "\n"; + } else { + dims[numdim - i - 1].size = _shape[i]; + dims[numdim - i - 1].degree = 1; + dims[numdim - i - 1].parallel_idx = -1; + dims[numdim - i - 1].is_replica_dim = false; + } + } + + + + + // for (int i = 0; i < numdim; i++) { + // dims[i].size = _shape[numdim - 1 - i]; + // dims[i].degree = 1; + // dims[i].parallel_idx = -1; + // dims[i].is_replica_dim = false; + // } // copy all replica dims for (int i = 0; i < num_replica_dims; i++) { dims[i + numdim] = input->dims[input->num_dims - 1 - i]; @@ -131,6 +181,24 @@ Reshape::Reshape(FFModel &model, } dims[numdim - 1 - i] = input->dims[input->num_dims - 1 - i]; } + + //TODO temporary fix for input to attention QK, fix it after fuse the attention block + if(match_pattern(_shape) && model.config.tensor_parallelism_degree > 1){ + //number of heads + + dims[2].size = 12; + dims[2].degree = model.config.tensor_parallelism_degree; + dims[2].parallel_idx = 0; + dims[2].is_replica_dim = true; + + dims[4].size = 1; + dims[4].degree = 1; + dims[4].parallel_idx = -1; + dims[4].is_replica_dim = false; + + } + + outputs[0] = 
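// Rough intent of the dimension-inheritance logic above (hedged reading of
// this hunk): when the reshape splits an input dimension ("expanded",
// numdim >= input->num_dims, e.g. (B, H*W) -> (B, H, W)), the resulting dims
// inherit the degree/parallel index of the input dim they were split from;
// when it merges adjacent dimensions ("aggregation", e.g. (B, H, W) ->
// (B, H*W)), the merged dim inherits from one of the two source dims; any
// other dim falls back to an unpartitioned dim of degree 1, as before.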
model.create_parallel_tensor_legion_ordering( numdim, dims, input->data_type, this); assert(outputs[0]->get_volume() == inputs[0]->get_volume()); diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 029b20afd..ab65db542 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -52,7 +52,10 @@ SoftmaxParams Softmax::get_params() const { return params; } -Tensor FFModel::softmax(const Tensor _input, int dim, char const *name) { +Tensor FFModel::softmax(const Tensor _input, + int dim, + bool last_layer, + char const *name) { Layer *sm = new Layer(this, OP_SOFTMAX, DT_FLOAT, @@ -69,6 +72,8 @@ Tensor FFModel::softmax(const Tensor _input, int dim, char const *name) { sm->outputs[0] = create_tensor_legion_ordering( numdims, dims, DT_FLOAT, sm, 0, true /*create_grad*/); sm->add_int_property("softmax_dim", dim); + + sm->add_int_property("last_layer", last_layer); layers.push_back(sm); return sm->outputs[0]; } @@ -80,15 +85,19 @@ Op *Softmax::create_operator_from_layer( long long value; layer->get_int_property("softmax_dim", value); int dim = (int)value; + layer->get_int_property("last_layer", value); + bool last_layer = (bool)value; return new Softmax(model, inputs[0], (inputs[0]->num_dims - 1 - dim) % inputs[0]->num_dims, + last_layer, layer->name); } Softmax::Softmax(FFModel &model, const ParallelTensor _input, int _dim, + bool _last_layer, char const *name) : Op(model, OP_SOFTMAX, @@ -98,7 +107,7 @@ Softmax::Softmax(FFModel &model, 0 /*weights*/, 1 /*outputs*/, _input), - dim(_dim) { + dim(_dim), last_layer(_last_layer) { // Currently assume we always perform softmax along the inner most dim assert(dim == 0); ParallelDim dims[MAX_TENSOR_DIM]; @@ -113,7 +122,7 @@ Softmax::Softmax(FFModel &model, SoftmaxParams const ¶ms, const ParallelTensor input, char const *name) - : Softmax(model, input, params.dim, name) {} + : Softmax(model, input, params.dim, params.last_layer, name) {} void Softmax::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); @@ -283,6 +292,13 @@ void Softmax::backward(FFModel const &ff) { EXCLUSIVE, outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); + + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); runtime->execute_index_space(ctx, launcher); } @@ -315,8 +331,8 @@ void Softmax::backward_task_with_dim(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); + assert(regions.size() == 3); + assert(task->regions.size() == 3); // const Softmax* softmax = (Softmax*) task->args; SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); TensorAccessorW acc_input_grad(regions[0], @@ -327,11 +343,16 @@ void Softmax::backward_task_with_dim(Task const *task, true /*readOutput*/); TensorAccessorR acc_output_grad( regions[1], task->regions[1], FID_DATA, ctx, runtime); + TensorAccessorR acc_output( + regions[2], task->regions[1], FID_DATA, ctx, runtime); // make sure the image indices match! 
assert(acc_input_grad.rect == acc_output_grad.rect); - backward_kernel_wrapper( - m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume()); + backward_kernel_wrapper(m, + acc_input_grad.ptr, + acc_output_grad.ptr, + acc_output.ptr, + acc_input_grad.rect.volume()); } bool Softmax::get_int_parameter(PMParameter para, int *value) const { @@ -377,11 +398,17 @@ bool Softmax::measure_operator_cost(Simulator *sim, float *output_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(output_grad_ptr != NULL); + float *output_ptr = + (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); backward = [&] { - backward_kernel_wrapper( - m, input_grad_ptr, output_grad_ptr, sub_output.get_volume()); + backward_kernel_wrapper(m, + input_grad_ptr, + output_grad_ptr, + output_ptr, + sub_output.get_volume()); }; } @@ -413,6 +440,7 @@ size_t hash::operator()( FlexFlow::SoftmaxParams const ¶ms) const { size_t key = 0; hash_combine(key, params.dim); + hash_combine(key, params.last_layer); return key; } }; // namespace std diff --git a/src/ops/split.cc b/src/ops/split.cc index 4f60cb96f..351785294 100644 --- a/src/ops/split.cc +++ b/src/ops/split.cc @@ -330,6 +330,17 @@ void Split::backward_task(Task const *task, split->numOutputs); } +tl::optional Split::as_dot() const { + RecordFormatter rr; + RecordFormatter r; + + r << this->inputs[0]->get_shape().as_dot(); + r << this->outputs[0]->get_shape().as_dot(); + rr << r; + + return rr; +} + bool Split::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc new file mode 100644 index 000000000..7052bb3ed --- /dev/null +++ b/src/parallel_ops/allreduce.cc @@ -0,0 +1,280 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/parallel_ops/allreduce.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/model.h" +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/utils/hash_utils.h" + +namespace FlexFlow { +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::LogicalPartition; +using Legion::LogicalRegion; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +using namespace FlexFlow::Kernels::AllReduce; + +/* Params */ +bool operator==(AllReduceParams const &lhs, AllReduceParams const &rhs) { + return lhs.allreduce_legion_dim == rhs.allreduce_legion_dim; +} + +bool AllReduceParams::is_valid(ParallelTensorShape const &input) const { + return input.is_valid(); +} + +AllReduceParams AllReduce::get_params() const { + AllReduceParams params; + params.allreduce_legion_dim = this->allreduce_dim; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } + return params; +} + +AllReduce::AllReduce(FFModel &model, + const ParallelTensor _input, + int _allreduce_legion_dim, + char const *name) + : ParallelOp(model, OP_ALLREDUCE, name, _input), + allreduce_dim(_allreduce_legion_dim) { + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + assert(dims[allreduce_dim].degree > 1); + // ParallelTensorBase::update_parallel_ids(numdim, dims); + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, _input->data_type, this); +} + +AllReduce::AllReduce(FFModel &model, + AllReduceParams const ¶ms, + ParallelTensor const input, + char const *name) + : AllReduce(model, input, params.allreduce_legion_dim, params.name) {} + +void AllReduce::create_input_partition(FFModel &ff) { + // Do nothing + return; +} + + +OpMeta *AllReduce::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + AllReduce *ar = (AllReduce *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + AllReduceMeta *meta = new AllReduceMeta(handle, ar); + meta->input_type[0] = ar->inputs[0]->data_type; + meta->output_type[0] = ar->outputs[0]->data_type; + assert(meta->input_type[0] == meta->output_type[0]); + return meta; +} + +void AllReduce::init(FFModel const &ff) { + ArgumentMap argmap; + parallel_is = outputs[0]->parallel_is; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(ALLREDUCE_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(AllReduce)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, 
fm); +} + +void AllReduce::forward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, + outputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +void AllReduce::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, + inputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + inputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +bool AllReduce::measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const { + cost_metrics = CostMetrics(); + cost_metrics.forward_time = 0.0f; + cost_metrics.backward_time = 0.0f; + + cost_metrics.sync_time = 0; + cost_metrics.inputs_memory = 0; + cost_metrics.outputs_memory = 0; + cost_metrics.weights_memory = 0; + return true; +} + +bool AllReduce::get_int_parameter(PMParameter para, int *value) const { + switch (para) { + case PM_ALLREDUCE_DIM: + *value = allreduce_dim; + return true; + default: + return Op::get_int_parameter(para, value); + } +} + +bool AllReduce::append_parallel_op_info( + std::vector ¶llel_ops) const { + ParallelOpInfo ret; + ret.op_type = op_type; + ret.parallel_dim = allreduce_dim; + ret.parallel_degree = -1; // AllReduce does not affect parallel degree + parallel_ops.push_back(ret); + return true; +} + +/*static*/ +void AllReduce::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + forward_kernel_wrapper(m, input, output); +} + +void AllReduce::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + 
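// Gradient note: for a sum all-reduce y_k = sum_j x_j computed on every rank
// k, the input gradient is dL/dx_j = sum_k dL/dy_k, so the backward pass is
// itself an all-reduce of the output gradients. The CUDA kernel later in this
// patch does exactly that (ncclAllReduce of output_grad into input_grad),
// while the HIP backward still asserts "To be implemented". The kernels also
// hardcode ncclFloat; a dispatch along the lines of the commented-out
// ff_to_nccl_datatype helper might look like this (sketch; the helper name
// and type coverage are assumptions):
//   ncclDataType_t to_nccl(DataType t) {
//     switch (t) {
//       case DT_FLOAT:  return ncclFloat;
//       case DT_HALF:   return ncclHalf;
//       case DT_DOUBLE: return ncclDouble;
//       case DT_INT32:  return ncclInt32;
//       case DT_INT64:  return ncclInt64;
//       default: assert(false); return ncclFloat;
//     }
//   }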
GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + backward_kernel_wrapper(m, input_grad, output_grad); +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::AllReduceParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.allreduce_legion_dim); + return key; +} + +} // namespace std diff --git a/src/parallel_ops/kernels/allreduce_kernels.cpp b/src/parallel_ops/kernels/allreduce_kernels.cpp new file mode 100644 index 000000000..0aea27107 --- /dev/null +++ b/src/parallel_ops/kernels/allreduce_kernels.cpp @@ -0,0 +1,58 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) + : OpMeta(handle) {} + +namespace Kernels { +namespace AllReduce { + +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; +#ifdef FF_USE_NCCL + // ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + checkNCCL(ncclAllReduce(input.ptr, + output.ptr, + input.domain.get_volume(), + ncclFloat, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif +} + +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +} // namespace AllReduce +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu new file mode 100644 index 000000000..1e932d2b1 --- /dev/null +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -0,0 +1,73 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) + : OpMeta(handle) {} + +namespace Kernels { +namespace AllReduce { + +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); +#ifdef FF_USE_NCCL + // ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + checkNCCL(ncclAllReduce(input.ptr, + output.ptr, + input.domain.get_volume(), + ncclFloat, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif +} + +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); +#ifdef FF_USE_NCCL + // ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + // std::cout <<"input volume: " << input.domain.get_volume() << "\n"; + // print_tensor((float*)input.ptr, 32, "input ptr"); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + output_grad.domain.get_volume(), + ncclFloat, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif +} + +} // namespace AllReduce +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/parallel_ops/kernels/replicate_kernels.cpp b/src/parallel_ops/kernels/replicate_kernels.cpp index 29f1d30d1..c66995877 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cpp +++ b/src/parallel_ops/kernels/replicate_kernels.cpp @@ -76,6 +76,45 @@ template void backward_kernel(float const *output_grad_ptr, size_t num_elements, size_t num_replicas); +template void forward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(double const *output_grad_ptr, + double *input_grad_ptr, + size_t num_elements, + size_t num_replicas); + +template void forward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(int64_t const *output_grad_ptr, + int64_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); + +template void forward_kernel(int32_t const *input_ptr, + int32_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(int32_t const *input_ptr, + int32_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(int32_t const *output_grad_ptr, + int32_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); + } // namespace Replicate } // namespace Kernels } // namespace FlexFlow diff --git a/src/parallel_ops/kernels/replicate_kernels.cu 
b/src/parallel_ops/kernels/replicate_kernels.cu index de208d2ae..6ed4f424c 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -68,6 +68,42 @@ template void backward_kernel(float const *output_grad_ptr, float *input_grad_ptr, size_t num_elements, size_t num_replicas); +template void forward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(double const *output_grad_ptr, + double *input_grad_ptr, + size_t num_elements, + size_t num_replicas); +template void forward_kernel(int32_t const *input_ptr, + int32_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(int32_t const *input_ptr, + int32_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(int32_t const *output_grad_ptr, + int32_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); +template void forward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(int64_t const *output_grad_ptr, + int64_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); } // namespace Replicate } // namespace Kernels diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index fee78043b..322ab061e 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -75,7 +75,7 @@ Replicate::Replicate(FFModel &model, dims[replicate_dim].degree *= replicate_degree; ParallelTensorBase::update_parallel_ids(numdim, dims); outputs[0] = model.create_parallel_tensor_legion_ordering( - numdim, dims, DT_FLOAT, this); + numdim, dims, _input->data_type, this); // inputs[0]->print("Replicate::input"); // outputs[0]->print("Replicate::output"); } @@ -115,7 +115,7 @@ void Replicate::init(FFModel const &ff) { Runtime *runtime = ff.config.lg_hlr; assert(numOutputs == 1); assert(numInputs == 1); - IndexLauncher launcher(REPLICATE_FWD_TASK_ID, + IndexLauncher launcher(REPLICATE_INIT_TASK_ID, outputs[0]->parallel_is, TaskArgument(NULL, 0), argmap, @@ -141,9 +141,10 @@ void Replicate::forward(FFModel const &ff) { Runtime *runtime = ff.config.lg_hlr; assert(numOutputs == 1); assert(numInputs == 1); + DataType data_type = inputs[0]->data_type; IndexLauncher launcher(REPLICATE_FWD_TASK_ID, outputs[0]->parallel_is, - TaskArgument(NULL, 0), + TaskArgument(&data_type, sizeof(DataType)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -169,7 +170,7 @@ void Replicate::backward(FFModel const &ff) { assert(numInputs == 1); IndexLauncher launcher(REPLICATE_BWD_TASK_ID, inputs[0]->parallel_is, - TaskArgument(NULL, 0), + TaskArgument(&data_type, sizeof(DataType)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -227,12 +228,40 @@ bool Replicate::append_parallel_op_info( return true; } +void Replicate::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) {} + +/*static*/ void Replicate::forward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); + DataType data_type = *((DataType *)task->args); + if (data_type == DT_FLOAT) { + forward_task_with_type(task, 
regions, ctx, runtime); + } else if (data_type == DT_DOUBLE) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT32) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT64) { + forward_task_with_type(task, regions, ctx, runtime); + } else { + assert(false && "Unsupported data type in Replicate forward"); + } +} + +template +void Replicate::forward_task_with_type( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); Domain output_domain = runtime->get_index_space_domain( @@ -243,12 +272,12 @@ void Replicate::forward_task(Task const *task, assert(output_domain.hi()[i] == input_domain.hi()[i]); } assert(input_domain.get_volume() == output_domain.get_volume()); - float const *input_ptr = helperGetTensorPointerRO( + T const *input_ptr = helperGetTensorPointerRO( regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *output_ptr = helperGetTensorPointerRW( + T *output_ptr = helperGetTensorPointerRW( regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel(input_ptr, output_ptr, input_domain.get_volume()); + forward_kernel(input_ptr, output_ptr, input_domain.get_volume()); } void Replicate::backward_task(Task const *task, @@ -257,6 +286,28 @@ void Replicate::backward_task(Task const *task, Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); + DataType data_type = *((DataType *)task->args); + if (data_type == DT_FLOAT) { + backward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_DOUBLE) { + backward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT32) { + backward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT64) { + backward_task_with_type(task, regions, ctx, runtime); + } else { + assert(false && "Unsupported data type in Embedding forward"); + } +} + +template +void Replicate::backward_task_with_type( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); Domain output_grad_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); Domain input_grad_domain = runtime->get_index_space_domain( @@ -268,12 +319,12 @@ void Replicate::backward_task(Task const *task, } size_t num_elements = input_grad_domain.get_volume(); size_t num_replicas = output_grad_domain.get_volume() / num_elements; - float const *output_grad_ptr = helperGetTensorPointerRO( + T const *output_grad_ptr = helperGetTensorPointerRO( regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *input_grad_ptr = helperGetTensorPointerRW( + T *input_grad_ptr = helperGetTensorPointerRW( regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel( + backward_kernel( output_grad_ptr, input_grad_ptr, num_elements, num_replicas); } diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 3e24f6b4e..a4a58e60f 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -61,6 +61,15 @@ __global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { dst[i] = src[i]; } } +template +__global__ void copy_kernel_with_replicate(DT *dst, + const DT *src, + coord_t origin_size, + coord_t size) { + CUDA_KERNEL_LOOP(i, size) { + dst[i] = src[i % origin_size]; + } +} 
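// ---- Editor's aside (illustrative sketch, not part of the patch) ----------
// copy_kernel_with_replicate above tiles a source buffer of `origin_size`
// elements across a destination of `size` elements via the index wrap
// `i % origin_size`. A self-contained launch sketch in plain CUDA, using a
// fixed block size instead of FlexFlow's GET_BLOCKS/CUDA_NUM_THREADS helpers;
// the function names here are hypothetical.
#include <cuda_runtime.h>

template <typename DT>
__global__ void replicate_copy(DT *dst,
                               DT const *src,
                               size_t origin_size,
                               size_t size) {
  size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  if (i < size) {
    dst[i] = src[i % origin_size]; // wrap around the smaller source buffer
  }
}

inline void launch_replicate_copy(float *d_dst,
                                  float const *d_src,
                                  size_t origin_size,
                                  size_t size,
                                  cudaStream_t stream) {
  int const threads = 256;
  int const blocks = static_cast<int>((size + threads - 1) / threads);
  replicate_copy<float>
      <<<blocks, threads, 0, stream>>>(d_dst, d_src, origin_size, size);
}
// ---- end aside -------------------------------------------------------------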
template __global__ void reluBackward(DT *grad_ptr, const DT *output, size_t n) { @@ -215,7 +224,7 @@ __host__ void int idx = 0; printf("%s", prefix); for (idx = 0; idx < num_elements; idx++) { - printf(" %.4lf", (float)host_ptr[idx]); + printf(" %.10lf", (float)host_ptr[idx]); if (idx >= 16) { break; } @@ -224,6 +233,76 @@ __host__ void checkCUDA(cudaFreeHost(host_ptr)); } +template +__host__ void + save_tensor(T const *ptr, size_t num_elements, char const *file_name) { + T *host_ptr; + checkCUDA(cudaHostAlloc(&host_ptr, + sizeof(T) * num_elements, + cudaHostAllocPortable | cudaHostAllocMapped)); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost)); + FILE *tensor_file; + tensor_file = fopen(file_name, "w"); + for (unsigned i = 0; i < num_elements; i++) { + fprintf(tensor_file, "%.8f, ", (float)host_ptr[i]); + } + + fclose(tensor_file); + checkCUDA(cudaFreeHost(host_ptr)); +} + +cudnnStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( + cudnnTensorDescriptor_t tensor, Domain domain, DataType data_type) { + int dims[MAX_TENSOR_DIM]; + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(data_type); + switch (domain.get_dim()) { + case 1: { + Rect<1> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + return cudnnSetTensor4dDescriptor( + tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[0], 1, 1, 1); + } + case 2: { + Rect<2> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + return cudnnSetTensor4dDescriptor( + tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[1], dims[0], 1, 1); + } + case 3: { + Rect<3> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + return cudnnSetTensor4dDescriptor(tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + dims[2] * dims[1], + dims[0], + 1, + 1); + } + case 4: { + Rect<4> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return cudnnSetTensor4dDescriptor(tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + dims[3] * dims[2] * dims[1], + dims[0], + 1, + 1); + } + default: + assert(false && "Unsupported dim number"); + } + return CUDNN_STATUS_BAD_PARAM; +} + cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Domain domain) { int dims[MAX_TENSOR_DIM]; @@ -340,6 +419,14 @@ template __global__ void template __global__ void copy_kernel(float *dst, float const *src, coord_t size); +template __global__ void copy_kernel_with_replicate(float *dst, + float const *src, + coord_t origin_size, + coord_t size); +template __global__ void copy_kernel_with_replicate( + int32_t *dst, int32_t const *src, coord_t origin_size, coord_t size); +template __global__ void copy_kernel_with_replicate( + int64_t *dst, int64_t const *src, coord_t origin_size, coord_t size); template __global__ void copy_kernel(int32_t *dst, int32_t const *src, coord_t size); template __global__ void @@ -370,3 +457,8 @@ template __host__ void print_tensor(int32_t const *ptr, size_t rect, char const *prefix); template __host__ void print_tensor(int64_t const *ptr, size_t rect, char const *prefix); +template __host__ void + save_tensor(float const *ptr, size_t rect, char const *file_name); +template __host__ void save_tensor(int32_t const *ptr, + size_t rect, + char const *file_name); diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 
d8f4e6e17..e2debfa2d 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -168,10 +168,14 @@ std::string get_operator_type_name(OperatorType type) { return "Replicate"; case OP_REDUCTION: return "Reduction"; + case OP_ALLREDUCE: + return "AllReduce"; case OP_PIPELINE: return "Pipeline"; case OP_FUSED_PARALLEL: return "FusedParallelOp"; + case OP_GELU: + return "Gelu"; default: throw std::runtime_error("Operator type unsupported: " + std::to_string(type)); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 5dbdae1ac..762c5911d 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -39,6 +39,7 @@ #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" #include "flexflow/parallel_ops/combine.h" +#include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" @@ -1882,11 +1883,11 @@ namespace { */ std::pair, std::unordered_map> try_one_lambda(std::pair &lambda, - Task const *task, + FFModel *model, std::shared_ptr &cached_simulator, bool perform_memory_search) { // Create a new fresh model - FFModel *model = *((FFModel **)task->args); + //FFModel *model = *((FFModel **)task->args); model->clear_graph_search_cache(); if (model->config.search_num_nodes.has_value()) { @@ -1900,6 +1901,70 @@ std::pair, std::unordered_map> model->config.workersPerNode, model->config.cpusPerNode, model->all_valid_views); + if (model->config.only_data_parallel) { + Graph *graph = new Graph(model); + graph->print_dot(); + std::unordered_map op_to_node_map; + for (FlexFlow::Op const *dstOp : model->operators) { + Node dstNode; + dstNode.ptr = dstOp; + dstNode.guid = model->node_global_guid++; + op_to_node_map[dstOp] = dstNode; + for (int j = 0; j < dstOp->numInputs; j++) { + FlexFlow::Op const *srcOp = dstOp->inputs[j]->owner_op; + assert(op_to_node_map.find(srcOp) != op_to_node_map.end()); + Node srcNode = op_to_node_map[srcOp]; + graph->add_edge(srcNode, dstNode, dstOp->inputs[j]->owner_idx, j); + } + } + graph->print_dot(); + std::unique_ptr curr_best_graph; + std::unordered_map curr_optimal_views; + curr_best_graph = std::unique_ptr(graph); + MachineView data_parallel_view; + data_parallel_view.device_type = MachineView::GPU; + data_parallel_view.ndims = 1; + data_parallel_view.dim[0] = + model->config.numNodes * model->config.workersPerNode; + data_parallel_view.stride[0] = 1; + data_parallel_view.start_device_id = 0; + // Currently assume a 1D machine view is needed + assert(model->config.data_parallelism_degree == 1 || + model->config.tensor_parallelism_degree == 1); + int degree = model->config.data_parallelism_degree * + model->config.tensor_parallelism_degree; + for (auto const &node : curr_best_graph->inEdges) { + Op const *op = node.first.ptr; + MachineView mv; + mv.device_type = MachineView::GPU; + mv.ndims = 1; + int total_parallel_degree = 1; + for (int i = 0; i < op->outputs[0]->num_dims; i++) { + total_parallel_degree *= op->outputs[0]->dims[i].degree; + } + mv.dim[0] = total_parallel_degree; + mv.stride[0] = 1; + mv.start_device_id = 0; + // std::cout << mv.start_device_id + degree - 1 << "\n"; + // std::cout << model->config.numNodes << "\n"; + // std::cout << model->config.workersPerNode << "\n"; + // assert(false); + assert(mv.start_device_id + degree - 1 < + model->config.numNodes * model->config.workersPerNode); + curr_optimal_views[node.first] = mv; + for (int i = 0; i < node.first.ptr->numOutputs; i++) { + 
assert(node.first.ptr->outputs[i]->is_valid_machine_view(mv)); + } + } + // for (auto const &node : curr_best_graph->inEdges) { + // curr_optimal_views[node.first] = data_parallel_view; + // } + return std::make_pair(std::move(curr_best_graph), curr_optimal_views); + } + + Runtime *runtime = model->config.lg_hlr; + Context ctx = model->config.lg_ctx; + const Task* task = runtime->get_current_task(ctx); Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::GPU_FB_MEM) .best_affinity_to(task->target_proc) @@ -1936,42 +2001,14 @@ std::pair, std::unordered_map> std::unique_ptr curr_best_graph; std::unordered_map curr_optimal_views; - if (model->config.only_data_parallel) { - Graph *graph = new Graph(model); - std::unordered_map op_to_node_map; - for (FlexFlow::Op const *dstOp : model->operators) { - Node dstNode; - dstNode.ptr = dstOp; - dstNode.guid = model->node_global_guid++; - op_to_node_map[dstOp] = dstNode; - for (int j = 0; j < dstOp->numInputs; j++) { - FlexFlow::Op const *srcOp = dstOp->inputs[j]->owner_op; - assert(op_to_node_map.find(srcOp) != op_to_node_map.end()); - Node srcNode = op_to_node_map[srcOp]; - graph->add_edge(srcNode, dstNode, dstOp->inputs[j]->owner_idx, j); - } - } - curr_best_graph = std::unique_ptr(graph); - MachineView data_parallel_view; - data_parallel_view.device_type = MachineView::GPU; - data_parallel_view.ndims = 1; - data_parallel_view.dim[0] = - model->config.numNodes * model->config.workersPerNode; - data_parallel_view.stride[0] = 1; - data_parallel_view.start_device_id = 0; - for (auto const &node : curr_best_graph->inEdges) { - curr_optimal_views[node.first] = data_parallel_view; - } - } else { - // Main step to optimize the PCG of an FFModel - model->graph_optimize(model->config.search_budget, - model->config.only_data_parallel, - curr_best_graph, - curr_optimal_views, - perform_memory_search, - MemoryOptimConfig{lambda.first}, - lambda.second); - } + // Main step to optimize the PCG of an FFModel + model->graph_optimize(model->config.search_budget, + model->config.only_data_parallel, + curr_best_graph, + curr_optimal_views, + perform_memory_search, + MemoryOptimConfig{lambda.first}, + lambda.second); // Return the best result of the current search return std::make_pair(std::move(curr_best_graph), curr_optimal_views); }; @@ -2043,12 +2080,20 @@ bool is_valid_strategy( * @param runtime Not used * @return GraphOptimalViewSerialized Serialized optimal PCG */ + GraphOptimalViewSerialized Graph::graph_optimize_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - auto model_config = (*((FFModel **)task->args))->config; + FFModel* model = *((FFModel **)task->args); + return Graph::graph_optimize_wrapper(model); +} + +/*static*/ +GraphOptimalViewSerialized + Graph::graph_optimize_wrapper(FFModel *model) { + auto model_config = model->config; bool perform_memory_search = model_config.perform_memory_search; float memory_threshold = model_config.device_mem; bool only_data_parallel = model_config.only_data_parallel; @@ -2064,7 +2109,7 @@ GraphOptimalViewSerialized // Be optimistic lambdas.emplace_back(std::make_pair(1.0, MemorySearchResult{})); auto try_result = try_one_lambda( - lambdas.back(), task, cached_simulator, perform_memory_search); + lambdas.back(), model, cached_simulator, perform_memory_search); best_graph = std::move(try_result.first); optimal_views = try_result.second; @@ -2080,7 +2125,7 @@ GraphOptimalViewSerialized // Not found the strategy; need to do binary search 
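// ---- Editor's aside (illustrative sketch, not part of the patch) ----------
// The code that follows retries with lambda = 0.0 and then bisects the
// memory/run-time trade-off factor in [0, 1] until is_valid_strategy()
// accepts a schedule under the device-memory budget. A standalone sketch of
// that search pattern, with a hypothetical `fits_in_memory` predicate
// standing in for the real validity check:
#include <functional>

inline double bisect_memory_lambda(std::function<bool(double)> fits_in_memory,
                                   int max_iters = 20) {
  double lo = 0.0;
  double hi = 1.0; // lambda = 1 favors run time, lambda = 0 favors memory
  if (fits_in_memory(hi)) {
    return hi; // the run-time-optimal strategy already fits
  }
  for (int i = 0; i < max_iters; i++) {
    double mid = 0.5 * (lo + hi);
    // keep the largest lambda whose strategy still fits in device memory
    if (fits_in_memory(mid)) {
      lo = mid;
    } else {
      hi = mid;
    }
  }
  return lo;
}
// ---- end aside -------------------------------------------------------------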
lambdas.emplace_back(std::make_pair(0.0, MemorySearchResult{})); try_result = try_one_lambda( - lambdas.back(), task, cached_simulator, perform_memory_search); + lambdas.back(), model, cached_simulator, perform_memory_search); best_graph = std::move(try_result.first); optimal_views = try_result.second; @@ -2107,7 +2152,7 @@ GraphOptimalViewSerialized lambdas.emplace_back(std::make_pair(mid, MemorySearchResult{})); try_result = try_one_lambda( - lambdas.back(), task, cached_simulator, perform_memory_search); + lambdas.back(), model, cached_simulator, perform_memory_search); if (!is_valid_strategy(lambdas, try_result.first.get(), @@ -2259,6 +2304,7 @@ GraphOptimalViewSerialized case OP_SOFTMAX: { Softmax *softmax = (Softmax *)op; sez.serialize(softmax->dim); + sez.serialize(softmax->last_layer); break; } case OP_REPARTITION: { @@ -2278,6 +2324,13 @@ GraphOptimalViewSerialized sez.serialize(reduction->reduction_dim); sez.serialize(reduction->reduction_degree); break; + } + case OP_ALLREDUCE: { + AllReduce *allreduce = (AllReduce *)op; + sez.serialize(allreduce->allreduce_dim); + sez.serialize(strlen(allreduce->name)); + sez.serialize(allreduce->name, strlen(allreduce->name)); + break; } case OP_COMBINE: { Combine *combine = (Combine *)op; @@ -2640,8 +2693,11 @@ void FFModel::deserialize_graph_optimal_view( case OP_SOFTMAX: { assert(num_inputs == 1); int softmax_dim; + bool last_layer; dez.deserialize(softmax_dim); - node = get_or_create_node(inputs[0], {softmax_dim}); + dez.deserialize(last_layer); + node = + get_or_create_node(inputs[0], {softmax_dim, last_layer}); break; } case OP_TRANSPOSE: { @@ -2684,6 +2740,17 @@ void FFModel::deserialize_graph_optimal_view( {reduction_dim, reduction_degree}); break; } + case OP_ALLREDUCE: { + assert(num_inputs == 1); + int allreduce_dim; + dez.deserialize(allreduce_dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); + node = get_or_create_node(inputs[0], {allreduce_dim}); + break; + } case OP_FUSED_PARALLEL: { assert(num_inputs == 1); std::vector parallel_ops; diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index 375b4f3d5..8617cb2ef 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -55,6 +55,16 @@ __global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { } } +template +__global__ void copy_kernel_with_replicate(DT *dst, + const DT *src, + coord_t origin_size, + coord_t size) { + CUDA_KERNEL_LOOP(i, size) { + dst[i] = src[i % origin_size]; + } +} + template __global__ void reluBackward(DT *grad_ptr, const DT *output, size_t n) { CUDA_KERNEL_LOOP(i, n) { @@ -298,6 +308,57 @@ miopenStatus_t return miopenStatusBadParm; } +miopenStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, + Domain domain) { + int dims[MAX_TENSOR_DIM]; + switch (domain.get_dim()) { + case 1: { + Rect<1> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + return miopenSet4dTensorDescriptor(tensor, miopenFloat, dims[0], 1, 1, 1); + } + case 2: { + Rect<2> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[1], dims[0], 1, 1); + } + case 3: { + Rect<3> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[2] * dims[1], dims[0], 1, 1); + } + case 4: { + Rect<4> rect = 
domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[3] * dims[2] * dims[1], dims[0], 1, 1); + } + case 5: { + Rect<5> rect = domain; + int leading_dim_size = rect.hi[4] - rect.lo[4] + 1; + assert(leading_dim_size == 1); + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[3], dims[2], dims[1], dims[0]); + } + default: + assert(false && "Unsupported dim number"); + } + return miopenStatusBadParm; +} + miopenDataType_t ff_to_cudnn_datatype(DataType type) { switch (type) { case DT_FLOAT: @@ -353,6 +414,16 @@ template __global__ void template __global__ void copy_kernel(float *dst, float const *src, coord_t size); + +template __global__ void copy_kernel_with_replicate(float *dst, + float const *src, + coord_t origin_size, + coord_t size); +template __global__ void copy_kernel_with_replicate( + int32_t *dst, int32_t const *src, coord_t origin_size, coord_t size); +template __global__ void copy_kernel_with_replicate( + int64_t *dst, int64_t const *src, coord_t origin_size, coord_t size); + template __global__ void copy_kernel(int32_t *dst, int32_t const *src, coord_t size); template __global__ void diff --git a/src/runtime/machine_view.cc b/src/runtime/machine_view.cc index dadece769..44dff5a2d 100644 --- a/src/runtime/machine_view.cc +++ b/src/runtime/machine_view.cc @@ -1,4 +1,5 @@ #include "flexflow/machine_view.h" +#include "flexflow/utils/hash_utils.h" namespace FlexFlow { @@ -47,15 +48,15 @@ size_t MachineView::num_parts() const { } size_t MachineView::hash() const { - size_t ret = 17; - ret = ret * 31 + std::hash()(device_type); - ret = ret * 31 + std::hash()(ndims); - ret = ret * 31 + std::hash()(start_device_id); + size_t h = 0; + hash_combine(h, device_type); + hash_combine(h, ndims); + hash_combine(h, start_device_id); for (int i = 0; i < ndims; i++) { - ret = ret * 31 + std::hash()(dim[i]); - ret = ret * 31 + std::hash()(stride[i]); + hash_combine(h, dim[i]); + hash_combine(h, stride[i]); } - return ret; + return h; } int MachineView::get_device_id(DomainPoint const &p) const { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index dbe4a7d92..6feddcd03 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -50,6 +50,7 @@ #include "flexflow/ops/split.h" #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" +#include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" @@ -78,10 +79,10 @@ Op::Op(FFModel &model, int numWeights, bool allocate_weights, int numOutputs, - const ParallelTensor input1, - const ParallelTensor input2, - const ParallelTensor input3, - const ParallelTensor input4) + ParallelTensor const input1, + ParallelTensor const input2, + ParallelTensor const input3, + ParallelTensor const input4) : Op(model, otype, dtype, @@ -101,10 +102,10 @@ Op::Op(FFModel &model, int _numInputs, int _numWeights, int _numOutputs, - const ParallelTensor _input1, - const ParallelTensor _input2, - const ParallelTensor _input3, - const ParallelTensor _input4) + ParallelTensor const _input1, + ParallelTensor const _input2, + ParallelTensor const _input3, + ParallelTensor const _input4) 
: op_type(_otype), data_type(_dtype), op_guid(model.op_global_guid++), numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs), profiling(model.config.profiling) { @@ -584,9 +585,15 @@ ncclComm_t Op::init_nccl_comms_task(Task const *task, } } ncclComm_t ncclComm; + fprintf(stderr, "Before ncclCommInitRank\n"); checkNCCL(ncclCommInitRank(&ncclComm, allRanks, ncclId, myRank)); - // fprintf(stderr, "ncclComm(%p) allRanks(%d) myRank(%d) ncclId(%p)\n", - // ncclComm, allRanks, myRank, ncclId); + fprintf(stderr, + "After ncclCommInitRank ncclComm(%p) allRanks(%d) myRank(%d) " + "ncclId(%p)\n", + ncclComm, + allRanks, + myRank, + ncclId); return ncclComm; } #endif @@ -781,9 +788,9 @@ void Op::register_output_parallel_dims( operation); } -int Op::get_output_to_input_dim_mapping(const ParallelTensor output, +int Op::get_output_to_input_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor input) { + ParallelTensor const input) { int output_idx = -1, input_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -816,9 +823,9 @@ int Op::get_output_to_input_dim_mapping(const ParallelTensor output, return -1; } -int Op::get_output_to_weight_dim_mapping(const ParallelTensor output, +int Op::get_output_to_weight_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor weight) { + ParallelTensor const weight) { int output_idx = -1, weight_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -884,6 +891,9 @@ bool Op::check_output_input_weight_parallel_dims(bool allocate_weights) const { break; } + printf("other dim degree: %d, input dim degree %d\n", + other_dim.degree, + input_dim.degree); assert(other_dim.degree == input_dim.degree); assert(other_dim.parallel_idx == input_dim.parallel_idx); } @@ -893,18 +903,25 @@ bool Op::check_output_input_weight_parallel_dims(bool allocate_weights) const { bool Op::check_output_input_weight_same_parallel_is() const { assert(numOutputs > 0); IndexSpace parallel_is = outputs[0]->parallel_is; + std::cout << "output space: " + << ", " << parallel_is << "\n"; for (int i = 0; i < numOutputs; i++) { if (outputs[i]->parallel_is != parallel_is) { + std::cout << "output mismatch" + << "\n"; return false; } } for (int i = 0; i < numInputs; i++) { + std::cout << "input space: " << i << ", " << inputs[i]->parallel_is << "\n"; if (inputs[i]->parallel_is != parallel_is) { return false; } } for (int i = 0; i < numWeights; i++) { if (weights[i]->parallel_is != parallel_is) { + std::cout << "weight mismatch" + << "\n"; return false; } } @@ -946,7 +963,7 @@ void Op::set_argumentmap_for_init(FFModel const &ff, ArgumentMap &argmap) { for (PointInRectIterator it(rect); it(); it++) { \ FFHandler handle = ff.handlers[view.get_device_id(*it)]; \ if (ff.config.computationMode == COMP_MODE_TRAINING && \ - op_type == OP_WEIGHT) { \ + (op_type == OP_WEIGHT || op_type == OP_ALLREDUCE)) { \ ncclComm_t *nccl_comms = ff.find_nccl_comms(view); \ handle.ncclComm = nccl_comms[idx++]; \ } \ @@ -1190,9 +1207,11 @@ FFModel::FFModel(FFConfig &_config) //} ArgumentMap argmap; - Rect<1> task_rect(Point<1>(0), - Point<1>(config.workersPerNode * config.numNodes - 1)); - IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); + // Rect<1> task_rect(Point<1>(0), + // Point<1>(config.workersPerNode * config.numNodes - 1)); + // IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); + Domain domain = runtime->get_index_space_domain(ctx, config.all_gpu_task_is); + Rect<1> 
task_rect = domain; // int rank = 0; for (PointInRectIterator<1> it(task_rect); it(); it++) { @@ -1206,7 +1225,7 @@ FFModel::FFModel(FFConfig &_config) // Init CUDA library on each worker IndexLauncher initLauncher(FF_INIT_TASK_ID, - task_is, + config.all_gpu_task_is, TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, @@ -1299,7 +1318,7 @@ Tensor FFModel::create_tensor(int numdim, } ParallelTensor FFModel::create_parallel_tensor(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1332,7 +1351,7 @@ Tensor FFModel::create_tensor_legion_ordering(int numdim, ParallelTensor FFModel::create_parallel_tensor_legion_ordering(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1382,7 +1401,7 @@ Tensor FFModel::create_tensor(int const dims[], } template -ParallelTensor FFModel::create_parallel_tensor(const ParallelDim dims[], +ParallelTensor FFModel::create_parallel_tensor(ParallelDim const dims[], DataType data_type, Op const *owner_op, int owner_idx, @@ -1463,7 +1482,7 @@ Parameter FFModel::create_weight(int numdim, } template -ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], +ParallelParameter FFModel::create_parallel_weight(ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1493,7 +1512,7 @@ ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], } ParallelParameter FFModel::create_parallel_weight(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1513,7 +1532,7 @@ ParallelParameter FFModel::create_parallel_weight(int numdim, ParallelParameter FFModel::create_parallel_weight_legion_ordering( int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1718,7 +1737,7 @@ void FFModel::map_weight_with_dim(ParallelTensor weight, } bool FFModel::get_parallel_tensor_from_tensor( - const Tensor tensor, ParallelTensor ¶llel_tensor) const { + Tensor const tensor, ParallelTensor ¶llel_tensor) const { // check if tensor->parallel_tensor is already set if (tensor->parallel_tensor != nullptr) { parallel_tensor = tensor->parallel_tensor; @@ -1755,7 +1774,7 @@ bool FFModel::get_parallel_tensor_from_tensor( } void FFModel::create_disjoint_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], IndexSpace const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -1778,7 +1797,7 @@ void FFModel::create_disjoint_partition(int num_dims, template void FFModel::create_disjoint_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], IndexSpaceT const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -1811,7 +1830,7 @@ void FFModel::create_disjoint_partition_with_dim2( } void FFModel::create_aliased_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, IndexSpace const &part_is, LogicalRegion const ®ion, @@ -1835,7 +1854,7 @@ void FFModel::create_aliased_partition(int num_dims, template void FFModel::create_aliased_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, IndexSpaceT const &part_is, LogicalRegion const ®ion, @@ -1872,7 +1891,7 @@ void FFModel::create_aliased_partition_with_dim2( } template -void FFModel::create_disjoint_partition(const ParallelTensor tensor, +void 
FFModel::create_disjoint_partition(ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -1920,7 +1939,7 @@ void FFModel::create_disjoint_partition(const ParallelTensor tensor, template void FFModel::create_data_parallel_partition_with_diff_dims( - const ParallelTensor tensor, + ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2302,7 +2321,7 @@ IndexSpace FFModel::get_task_is(ParallelConfig const &pc) const { return get_task_is(view); } -IndexSpace FFModel::get_or_create_task_is(const ParallelTensor tensor) { +IndexSpace FFModel::get_or_create_task_is(ParallelTensor const tensor) { MachineView view; view.ndims = 0; for (int i = 0; i < tensor->num_dims; i++) { @@ -2473,6 +2492,10 @@ void FFModel::update() { } } +void FFModel::unified_update() { + optimizer->unified_update(parameters); +} + Op *FFModel::get_final_operator() const { int idx = operators.size() - 1; while (operators[idx]->op_type == OP_INPUT || @@ -2503,9 +2526,10 @@ bool FFModel::apply_fusion(std::vector const &operators, operators[l]->op_type == OP_WEIGHT) { continue; } - // don't fuse parallel op since they have different parallel_is in - // forward/backward - if (operators[l]->is_parallel_op()) { + // don't fuse parallel op except allReduce since they have different + // parallel_is in forward/backward + if (operators[l]->is_parallel_op() && + operators[l]->op_type != OP_ALLREDUCE) { continue; } size_t start = 0; @@ -2548,9 +2572,10 @@ bool FFModel::apply_fusion(std::vector const &operators, operators[i]->op_type == OP_WEIGHT) { continue; } - // don't fuse parallel op since they have different parallel_is in - // forward/backward - if (operators[i]->is_parallel_op()) { + // don't fuse parallel op except allReduce since they have different + // parallel_is in forward/backward + if (operators[i]->is_parallel_op() && + operators[i]->op_type != OP_ALLREDUCE) { continue; } fused_op = new FusedOp(*this, operators[i]); @@ -2622,6 +2647,18 @@ Op *FFModel::create_operator_from_layer( dims[num_dims].degree = 1; dims[num_dims].parallel_idx = -1; dims[num_dims].is_replica_dim = true; + if (config.tensor_parallelism_degree > 1 && num_inputs != 1) { + dims[num_dims].size *= config.tensor_parallelism_degree; + dims[num_dims].degree *= config.tensor_parallelism_degree; + dims[num_dims].parallel_idx = 0; + } + //TODO temporary fix for input to attention QK, fix it after fuse the attention block + else if(config.tensor_parallelism_degree > 1){ + //n heads + dims[num_dims].size *= 12; + dims[num_dims].degree *= config.tensor_parallelism_degree; + dims[num_dims].parallel_idx = 0; + } // create_parallel_tensor adds an NoOp into operators ParallelTensor pt = create_parallel_tensor_legion_ordering(num_dims + 1, @@ -2635,11 +2672,14 @@ Op *FFModel::create_operator_from_layer( assert(tensor->parallel_tensor == nullptr); tensor->parallel_tensor = pt; // start from data parllel tensor - if (config.only_data_parallel) { - Repartition *part = new Repartition( - *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); - operators.push_back(part); - } + // if (config.only_data_parallel && + // config.computationMode == COMP_MODE_TRAINING) { + // Repartition *part = new Repartition( + // *this, pt, num_dims - 1, config.numNodes * + // config.workersPerNode); + // operators.push_back(part); + // } + num_inputs++; return operators[operators.size() - 1]; } case OP_MULTIHEAD_ATTENTION: { @@ -2782,9 +2822,42 @@ Op 
*FFModel::create_operator_from_layer( } } +bool FFModel::is_transformer_block(int layer_idx) const { + auto const &l = layers[layer_idx]; + if (l->op_type == OP_DROPOUT && layer_idx >= 4 && + layers[layer_idx - 1]->op_type == OP_LINEAR && + layers[layer_idx - 2]->op_type == OP_RESHAPE && + layers[layer_idx - 3]->op_type == OP_TRANSPOSE && + layers[layer_idx - 4]->op_type == OP_BATCHMATMUL) { + return true; + } + return false; +} +bool FFModel::is_mlp_block(int layer_idx) const { + auto const &l = layers[layer_idx]; + // standard opt relu + if (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_RELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) { + return true; + } + // mlp layer with relu embedded in first dense layer + if (l->op_type == OP_LINEAR && layer_idx >= 1 && + layers[layer_idx - 1]->op_type == OP_LINEAR) { + long long value; + layers[layer_idx - 1]->get_int_property("activation", value); + ActiMode activation = (ActiMode)value; + if (activation == AC_MODE_RELU) { + return true; + } + } + return false; +} + void FFModel::create_operators_from_layers() { - std::map tensors_to_parallel_tensors; - for (auto const &l : layers) { + std::map tensors_to_parallel_tensors; + for (int layer_idx = 0; layer_idx < layers.size(); layer_idx++) { + auto const &l = layers[layer_idx]; std::vector inputs; for (int i = 0; i < l->numInputs; i++) { // create new input tensors @@ -2792,7 +2865,50 @@ void FFModel::create_operators_from_layers() { tensors_to_parallel_tensors.end()); inputs.push_back(tensors_to_parallel_tensors[l->inputs[i]]); } - Op *op = create_operator_from_layer(l, inputs); + // Op *op = create_operator_from_layer(l, inputs); + Op *op = nullptr; + if (config.tensor_parallelism_degree > 1 && l->op_type == OP_LAYERNORM && + layer_idx == layers.size() - 6) { + std::vector partitioned_inputs; + Combine *comb = new Combine(*this, + inputs[0], + 3 /*inner most dim*/, + config.tensor_parallelism_degree); + partitioned_inputs.push_back(comb->outputs[0]); + operators.push_back(comb); + op = create_operator_from_layer(l, partitioned_inputs); + } else { + op = create_operator_from_layer(l, inputs); + } + + // add replicate operators after op if needed + if (config.tensor_parallelism_degree > 1 && l->op_type == OP_EMBEDDING) { + // assert(op->numOutputs == 1); + // Replicate *repl = new Replicate(*this, + // op->outputs[0], + // op->outputs[0]->num_dims - 1, + // config.tensor_parallelism_degree); + // operators.push_back(repl); + // op = repl; + } else if (config.tensor_parallelism_degree > 1 && + (is_transformer_block(layer_idx) || is_mlp_block(layer_idx) || + // llama mlp layer + (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_GELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) || + // LLAMA without element-wise operator fusion + (l->op_type == OP_LINEAR && layer_idx >= 5 && + layers[layer_idx - 1]->op_type == OP_EW_MUL && + layers[layer_idx - 2]->op_type == OP_EW_MUL && + layers[layer_idx - 3]->op_type == OP_SIGMOID && + layers[layer_idx - 4]->op_type == OP_LINEAR && + layers[layer_idx - 5]->op_type == OP_LINEAR))) { + assert(op->numOutputs == 1); + AllReduce *allreduce = + new AllReduce(*this, op->outputs[0], op->outputs[0]->num_dims - 1); + operators.push_back(allreduce); + op = allreduce; + } assert(op->numOutputs == l->numOutputs); for (int i = 0; i < op->numOutputs; i++) { tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; @@ -2823,12 +2939,15 @@ void FFModel::compile(LossType loss_type, // Launch the 
graph optimize task { FFModel *model = this; - TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, - TaskArgument(&model, sizeof(FFModel *))); - Future future = runtime->execute_task(ctx, launcher); - - PCG::GraphOptimalViewSerialized ret = - future.get_result(); + PCG::GraphOptimalViewSerialized ret; + if (false) { + TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, + TaskArgument(&model, sizeof(FFModel *))); + Future future = runtime->execute_task(ctx, launcher); + ret = future.get_result(); + } else { + ret = PCG::Graph::graph_optimize_wrapper(this); + } Deserializer dez(ret.data, ret.total_bytes); // Reconstruct operators PCG::Graph *best_graph = new PCG::Graph(this); @@ -2961,6 +3080,60 @@ void FFModel::compile(LossType loss_type, } } + int degree = + config.data_parallelism_degree * config.tensor_parallelism_degree; + + for (int op_idx = 0; op_idx < operators.size(); op_idx++) { + Op const *op = operators[op_idx]; + // Skip weight operators + if (op->op_type == OP_WEIGHT) { + continue; + } + // Get machine views + std::vector machine_views; + for (int j = 0; j < config.data_parallelism_degree; j++) { + MachineView mv; + mv.device_type = MachineView::GPU; + mv.ndims = 1; + // mv.start_device_id = 0; + mv.stride[0] = 1; + int parallel_degree = 1; + for (int k = 0; k < op->outputs[0]->num_dims; k++) { + parallel_degree *= op->outputs[0]->dims[k].degree; + } + mv.dim[0] = parallel_degree; + mv.start_device_id = 0; + assert(mv == op->outputs[0]->machine_view); + machine_views.push_back(mv); + } + for (int i = 0; i < op->numOutputs; i++) { + ParallelTensor pt_base = op->outputs[i]; + + if (op->op_type == OP_REPLICATE) { + assert(op->numInputs == 1 && op->numOutputs == 1); + } + std::vector list; + bool found_parallel_tensor = false; + if (!found_parallel_tensor) { + for (int j = 0; j < config.data_parallelism_degree; j++) { + // Copy the metadata from pt_base to pt + ParallelTensor pt = new ParallelTensorBase(*pt_base); + pt->region = + runtime->create_logical_region(ctx, + pt_base->region.get_index_space(), + pt_base->region.get_field_space()); + pt->part = runtime->get_logical_partition( + ctx, pt->region, pt_base->part.get_index_partition()); + pt->machine_view = machine_views[j]; + Domain part_domain = + runtime->get_index_space_domain(ctx, pt_base->parallel_is); + assert(pt->machine_view.get_domain() == part_domain); + list.push_back(pt); + } + } + } + } + // Perform fusion optimizations if (config.perform_fusion) { fprintf(stderr, "Applying fusion optimizations during compilation...\n"); @@ -3468,34 +3641,34 @@ void FFIterationConfig::reset() { // Default Config Parameters struct DefaultConfig { - const static int epochs = 1; + static int const epochs = 1; // const static int iterations = 1; - const static int batchSize = 64; - const static bool profiling = false; + static int const batchSize = 64; + static bool const profiling = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; - const static size_t workSpaceSize = (size_t)1 * 1024 * 1024 * 1024; // 2GB - const static int numNodes = 1; - const static int workersPerNode = 0; - const static int cpusPerNode = 0; - const static size_t searchBudget = -1; - const static size_t simulatorWorkSpaceSize = + static size_t const workSpaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2GB + static int const numNodes = 1; + static int const workersPerNode = 0; + static int const cpusPerNode = 0; + static size_t const searchBudget = -1; + static size_t const simulatorWorkSpaceSize = (size_t)2 * 1024 * 1024 * 1024; 
// 2GB constexpr static float searchAlpha = 1.2f; - const static bool searchOverlapBackwardUpdate = false; - const static bool onlyDataParallel = false; - const static bool enableSampleParallel = true; - const static bool enableParameterParallel = false; - const static bool enableAttributeParallel = false; - const static bool enableInplaceOptimizations = false; - const static bool allowTensorOpMathConversion = false; - const static int machine_model_version = 0; - const static int simulator_segment_size = 16777216; // 16 MB - const static int simulator_max_num_segments = 1; - const static int base_optimize_threshold = 10; - const static bool enable_control_replication = true; + static bool const searchOverlapBackwardUpdate = false; + static bool const onlyDataParallel = false; + static bool const enableSampleParallel = true; + static bool const enableParameterParallel = false; + static bool const enableAttributeParallel = false; + static bool const enableInplaceOptimizations = false; + static bool const allowTensorOpMathConversion = false; + static int const machine_model_version = 0; + static int const simulator_segment_size = 16777216; // 16 MB + static int const simulator_max_num_segments = 1; + static int const base_optimize_threshold = 10; + static bool const enable_control_replication = true; // The default python data loader type is 2 to enable control replication - const static int python_data_loader_type = 2; + static int const python_data_loader_type = 2; }; FFConfig::FFConfig() { @@ -3515,6 +3688,8 @@ FFConfig::FFConfig() { search_overlap_backward_update = DefaultConfig::searchOverlapBackwardUpdate; computationMode = COMP_MODE_TRAINING; only_data_parallel = DefaultConfig::onlyDataParallel; + data_parallelism_degree = 1; + tensor_parallelism_degree = 1; enable_sample_parallel = DefaultConfig::enableSampleParallel; enable_parameter_parallel = DefaultConfig::enableParameterParallel; enable_attribute_parallel = DefaultConfig::enableAttributeParallel; @@ -3560,6 +3735,9 @@ FFConfig::FFConfig() { Runtime *runtime = Runtime::get_runtime(); lg_hlr = runtime; lg_ctx = Runtime::get_context(); + Rect<1> task_rect(Point<1>(0), Point<1>(workersPerNode * numNodes - 1)); + // Create an index space for tasks running on all GPUs + all_gpu_task_is = runtime->create_index_space(lg_ctx, task_rect); field_space = runtime->create_field_space(lg_ctx); } @@ -3620,6 +3798,16 @@ void FFConfig::parse_args(char **argv, int argc) { only_data_parallel = true; continue; } + // data parallelism degree + if (!strcmp(argv[i], "-data-parallelism-degree")) { + data_parallelism_degree = std::stoi(argv[++i]); + continue; + } + // tensor parallelism degree + if (!strcmp(argv[i], "-tensor-parallelism-degree")) { + tensor_parallelism_degree = std::stoi(argv[++i]); + continue; + } if ((!strcmp(argv[i], "--enable-parameter-parallel"))) { enable_parameter_parallel = true; continue; @@ -5126,6 +5314,20 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } // Replicate + { + TaskVariantRegistrar registrar(REPLICATE_INIT_TASK_ID, "Replicate Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Replicate Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(REPLICATE_FWD_TASK_ID, "Replicate Forward"); 
registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -5183,6 +5385,49 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + // AllReduce + { + TaskVariantRegistrar registrar(ALLREDUCE_INIT_TASK_ID, "AllReduce Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } // FusedParallelOp { TaskVariantRegistrar registrar(FUSED_PARALLELOP_FWD_TASK_ID, @@ -5275,6 +5520,23 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(ADAM_UNIFY_UPD_NCCL_TASK_ID, + "Adam unified NCCL Update"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + AdamOptimizer::nccl_unified_update_task>( + registrar, "Adam unified NCCL Update Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } #endif // Initializer { diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 41dd37dec..322d7840f 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -28,6 +28,7 @@ #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" #include "flexflow/parallel_ops/combine.h" +#include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" @@ -94,6 +95,8 @@ tl::optional get_op_parameters(Op const *op) { return ((Reduction *)op)->get_params(); case OP_COMBINE: return ((Combine *)op)->get_params(); + case OP_ALLREDUCE: + return ((AllReduce *)op)->get_params(); case OP_FUSED_PARALLEL: return ((FusedParallelOp *)op)->get_params(); case OP_TRANSPOSE: diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index c42a0c9aa..91a16e8db 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -333,6 +333,7 @@ void AdamOptimizer::init(void) { Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; Initializer *initializer = new ZeroInitializer(); + reservedWorkSpaceSize = 0; for (size_t i = 0; i < model->parameters.size(); i++) { ParallelTensor p = model->parameters[i]; Domain domain = @@ -381,6 +382,7 @@ void AdamOptimizer::update(const ParallelTensor p) { 
assert(v_values.find(p->region) != v_values.end()); assert(m_values.find(p->region) != m_values.end()); assert(p->owner_op != NULL); + reservedWorkSpaceSize += p->get_volume() * sizeof(float); if (p->sync_type == ParameterSyncType::PS) { TaskLauncher launcher(ADAM_UPD_PS_TASK_ID, TaskArgument(this, sizeof(AdamOptimizer)), @@ -492,6 +494,119 @@ void AdamOptimizer::update(const ParallelTensor p) { } } +void SGDOptimizer::unified_update(std::vector const parameters) { + //todo +} + +void AdamOptimizer::unified_update(std::vector const parameters) { + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; + const ParallelTensor p0 = parameters.at(0); + ArgumentMap argmap; + Domain domain = runtime->get_index_space_domain(ctx, p0->parallel_is); + switch (domain.get_dim()) { +#define DIMFUNC(DIM) \ + case DIM: { \ + Rect rect = domain; \ + int idx = 0; \ + for (PointInRectIterator it(rect); it(); it++) { \ + OpMeta *mp = p0->owner_op->meta[idx++]; \ + argmap.set_point(*it, TaskArgument(&mp, sizeof(OpMeta *))); \ + } \ + break; \ + } + LEGION_FOREACH_N(DIMFUNC) +#undef DIMFUNC + default: + assert(false); + } + + int offset = 0; + int processed_parameters_num = 0; + // printf("param size: %d\n", parameters.size()); + + size_t workSpaceSize = model->handlers->workSpaceSize * + model->config.workersPerNode * model->config.numNodes; + + while (processed_parameters_num < parameters.size()) { + parameters_num = 0; + + for(int i = processed_parameters_num; i < parameters.size(); i++){ + const ParallelTensor p = parameters.at(i); + assert(v_values.find(p->region) != v_values.end()); + assert(m_values.find(p->region) != m_values.end()); + assert(p->owner_op != NULL); + if (reservedWorkSpaceSize + p->get_volume() * sizeof(float) >= workSpaceSize) { + break; + } + reservedWorkSpaceSize += p->get_volume() * sizeof(float); + parameters_num += 1; + assert(p->sync_type == ParameterSyncType::NCCL); + assert(p->parallel_is != IndexSpace::NO_SPACE); + } + + // printf("parameters_num: %d %zu, %zu, %d\n", parameters_num, + // reservedWorkSpaceSize, model->handlers->workSpaceSize, + // parameters.size()); + assert(processed_parameters_num <= parameters.size()); + + IndexLauncher launcher(ADAM_UNIFY_UPD_NCCL_TASK_ID, + p0->parallel_is, + TaskArgument(this, sizeof(AdamOptimizer)), + argmap, + Predicate::TRUE_PRED, + false /*must_epoch*/, + 0 /*mapper_id*/, + p0->machine_view.hash()); + // launch a unified task + for (int j = 0; j < parameters_num; j++) { + const ParallelTensor p = parameters.at(processed_parameters_num + j); + + // regions[0]: region_grad + launcher.add_region_requirement(RegionRequirement(p->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + p->region_grad)); + launcher.add_field(offset, FID_DATA); + // regions[1]: region + launcher.add_region_requirement(RegionRequirement( + p->part, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, p->region)); + launcher.add_field(offset + 1, FID_DATA); + // regions[2]: w_region + launcher.add_region_requirement( + RegionRequirement(v_values[p->region]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + v_values[p->region]->region)); + launcher.add_field(offset + 2, FID_DATA); + // regions[3]: m_region + launcher.add_region_requirement( + RegionRequirement(m_values[p->region]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + m_values[p->region]->region)); + launcher.add_field(offset + 3, FID_DATA); + offset += 4; + } + + // update alpha, beta + for (int i = 0; i < parameters_num; i++) { + this->next(); + } + 
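// ---- Editor's aside (illustrative sketch, not part of the patch) ----------
// The loop above advances the Adam schedule once per parameter folded into
// this fused launch, mirroring the per-tensor recurrence in the GPU task
// below (beta1_t *= beta1; beta2_t *= beta2;
// alpha_t = alpha * sqrt(1 - beta2_t) / (1 - beta1_t)). A standalone sketch
// of that bias-correction recurrence; the initial values here are
// illustrative defaults, not necessarily FlexFlow's.
#include <cmath>

struct AdamSchedule {
  double alpha = 1e-3, beta1 = 0.9, beta2 = 0.999;
  double beta1_t = 1.0, beta2_t = 1.0, alpha_t = 1e-3;
  void next() {
    beta1_t *= beta1;
    beta2_t *= beta2;
    alpha_t = alpha * std::sqrt(1.0 - beta2_t) / (1.0 - beta1_t);
  }
};
// ---- end aside -------------------------------------------------------------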
launcher.concurrent = true; + FutureMap fm = runtime->execute_index_space(ctx, launcher); + // runtime->execute_must_epoch(ctx, must_epoch_launcher); + runtime->issue_execution_fence(ctx); + reservedWorkSpaceSize = 0; + offset = 0; + processed_parameters_num += parameters_num; + } + parameters_num = 0; +} + void AdamOptimizer::ps_update_task(Task const *task, std::vector const ®ions, Context ctx, @@ -605,6 +720,72 @@ void AdamOptimizer::nccl_update_task(Task const *task, nccl_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); } + +void AdamOptimizer::nccl_unified_update_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // assert(regions.size() == 4); + // assert(task->regions.size() == 4); + AdamOptimizer const *op = (AdamOptimizer *)task->args; + OpMeta const *meta = *((OpMeta **)task->local_args); + // FFHandler handler = *((FFHandler*) task->local_args); + Domain domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + + // float const *w_grad_ptr[op->parameters_num]; + // float *w_ptr[op->parameters_num], *v_ptr[op->parameters_num], + // *m_ptr[op->parameters_num]; + + // hipMalloc(w_grad_ptr, sizeof(float*) * op->parameters_num); + // hipMalloc(w_ptr, sizeof(float*) * op->parameters_num); + // hipMalloc(v_ptr, sizeof(float*) * op->parameters_num); + // hipMalloc(m_ptr, sizeof(float*) * op->parameters_num); + GenericTensorAccessorR accWGrads[op->parameters_num]; + GenericTensorAccessorW accWs[op->parameters_num]; + GenericTensorAccessorW accVs[op->parameters_num]; + GenericTensorAccessorW accMs[op->parameters_num]; + size_t *size = new size_t[op->parameters_num]; + int offset = 0; + + // printf("parameters_num: %d\n", op->parameters_num); + + for (int i = 0; i < op->parameters_num; i++) { + accWGrads[i] = helperGetGenericTensorAccessorRO(DataType::DT_FLOAT, + regions[offset], + task->regions[offset], + FID_DATA, + ctx, + runtime); + accWs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, + regions[offset + 1], + task->regions[offset + 1], + FID_DATA, + ctx, + runtime); + accVs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, + regions[offset + 2], + task->regions[offset + 2], + FID_DATA, + ctx, + runtime); + accMs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, + regions[offset + 3], + task->regions[offset + 3], + FID_DATA, + ctx, + runtime); + offset += 4; + + size[i] = accWGrads[i].domain.get_volume(); + // w_grad_ptr[i] = accWGrad.get_float_ptr(); + // w_ptr[i] = accW.get_float_ptr(); + // v_ptr[i] = accV.get_float_ptr(); + // m_ptr[i] = accM.get_float_ptr(); + } + nccl_unified_update_task_gpu(op, meta, accWGrads, size, accWs, accVs, accMs); +} #endif }; // namespace FlexFlow diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index 232799e02..373eb3fe7 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -87,6 +87,7 @@ __host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, #ifdef FF_USE_NCCL __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, + OpMeta const *meta, float const *w_grad_ptr, size_t size, float *w_ptr, @@ -203,11 +204,13 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, m_ptr, v_ptr, w_ptr); + // checkCUDA(hipDeviceSynchronize()); } #ifdef FF_USE_NCCL __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, + OpMeta const *meta, float const *w_grad_ptr, size_t size, float *w_ptr, @@ -243,6 +246,74 @@ 
                                                   w_ptr);
   // checkCUDA(hipDeviceSynchronize());
 }
+
+__host__ void AdamOptimizer::nccl_unified_update_task_gpu(
+    AdamOptimizer const *op,
+    OpMeta const *meta,
+    GenericTensorAccessorR *accWGrads,
+    size_t *size,
+    GenericTensorAccessorW *accWs,
+    GenericTensorAccessorW *accVs,
+    GenericTensorAccessorW *accMs) {
+
+  hipStream_t stream;
+  checkCUDA(get_legion_stream(&stream));
+  // assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize);
+
+  void *workSpace_ptr = meta->handle.workSpace;
+
+  // pack the gradients of all batched parameters back-to-back into the
+  // workspace
+  for (int i = 0; i < op->parameters_num; i++) {
+    hipMemcpyAsync(workSpace_ptr,
+                   accWGrads[i].get_float_ptr(),
+                   size[i] * sizeof(float),
+                   hipMemcpyDeviceToDevice,
+                   stream);
+    workSpace_ptr =
+        static_cast<char *>(workSpace_ptr) + size[i] * sizeof(float);
+  }
+
+  // do allreduce once; workSpaceSize is in bytes, so convert it to a float
+  // element count for NCCL
+  checkNCCL(ncclAllReduce(meta->handle.workSpace,
+                          (float *)meta->handle.workSpace,
+                          meta->handle.workSpaceSize / sizeof(float),
+                          ncclFloat,
+                          ncclSum,
+                          meta->handle.ncclComm,
+                          stream));
+
+  workSpace_ptr = static_cast<char *>(meta->handle.workSpace);
+  float alpha_t = op->alpha_t;
+  float beta1_t = op->beta1_t;
+  float beta2_t = op->beta2_t;
+  for (int i = 0; i < op->parameters_num; i++) {
+    // update the i-th parameter from its slice of the reduced buffer
+    // printf("update %d\n", i);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(adam_update),
+                       GET_BLOCKS(size[i]),
+                       CUDA_NUM_THREADS,
+                       0,
+                       stream,
+                       size[i],
+                       alpha_t,
+                       op->beta1,
+                       op->beta2,
+                       op->weight_decay,
+                       op->epsilon,
+                       static_cast<float const *>(workSpace_ptr),
+                       accMs[i].get_float_ptr(),
+                       accVs[i].get_float_ptr(),
+                       accWs[i].get_float_ptr());
+    workSpace_ptr =
+        static_cast<char *>(workSpace_ptr) + size[i] * sizeof(float);
+
+    // advance the per-parameter Adam scalars
+    beta1_t *= op->beta1;
+    beta2_t *= op->beta2;
+    alpha_t = op->alpha * sqrt(1 - beta2_t) / (1 - beta1_t);
+  }
+
+  // checkCUDA(hipDeviceSynchronize());
+}
 #endif
 }; // namespace FlexFlow
\ No newline at end of file
diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu
index 5f654fbb5..17adce94b 100644
--- a/src/runtime/optimizer_kernel.cu
+++ b/src/runtime/optimizer_kernel.cu
@@ -216,6 +216,103 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op,
                                                   w_ptr);
   // checkCUDA(cudaDeviceSynchronize());
 }
+
+__host__ void AdamOptimizer::nccl_unified_update_task_gpu(
+    AdamOptimizer const *op,
+    OpMeta const *meta,
+    GenericTensorAccessorR *accWGrads,
+    size_t *size,
+    GenericTensorAccessorW *accWs,
+    GenericTensorAccessorW *accVs,
+    GenericTensorAccessorW *accMs) {
+  cudaStream_t stream;
+  checkCUDA(get_legion_stream(&stream));
+  // assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize);
+
+  cudaEvent_t t_start, t_start1, t_start2, t_end;
+  cudaEventCreate(&t_start);
+  cudaEventCreate(&t_start1);
+  cudaEventCreate(&t_start2);
+  cudaEventCreate(&t_end);
+  cudaEventRecord(t_start, stream);
+  cudaEventRecord(t_start1, stream);
+  cudaEventRecord(t_start2, stream);
+
+  // stage the gradients in a temporary device buffer instead of
+  // meta->handle.workSpace
+  void *allocate_ptr;
+  checkCUDA(cudaMalloc(&allocate_ptr, meta->handle.workSpaceSize));
+
+  void *workSpace_ptr = allocate_ptr;
+
+  for (int i = 0; i < op->parameters_num; i++) {
+    cudaMemcpyAsync(workSpace_ptr,
+                    accWGrads[i].get_float_ptr(),
+                    size[i] * sizeof(float),
+                    cudaMemcpyDeviceToDevice,
+                    stream);
+    workSpace_ptr =
+        static_cast<char *>(workSpace_ptr) + size[i] * sizeof(float);
+  }
+
+  cudaEventRecord(t_end, stream);
+  checkCUDA(cudaEventSynchronize(t_end));
+  float elapsed = 0;
+  checkCUDA(cudaEventElapsedTime(&elapsed, t_start1, t_end));
+  cudaEventDestroy(t_start1);
+  printf("[optimizer] data copy time = %.2lfms\n", elapsed);
+
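+  // At this point the gradients of every parameter in the batch sit
+  // back-to-back in the staging buffer.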
+  // do allreduce once over the staging buffer; workSpaceSize is in bytes, so
+  // convert it to a float element count for NCCL
+  checkNCCL(ncclAllReduce(allocate_ptr,
+                          (float *)allocate_ptr,
+                          meta->handle.workSpaceSize / sizeof(float),
+                          ncclFloat,
+                          ncclSum,
+                          meta->handle.ncclComm,
+                          stream));
+  cudaEventRecord(t_end, stream);
+  checkCUDA(cudaEventSynchronize(t_end));
+  elapsed = 0;
+  checkCUDA(cudaEventElapsedTime(&elapsed, t_start2, t_end));
+  cudaEventDestroy(t_start2);
+  printf("[optimizer] allreduce time = %.2lfms\n", elapsed);
+
+  // workSpace_ptr = static_cast<char *>(meta->handle.workSpace);
+  workSpace_ptr = static_cast<char *>(allocate_ptr);
+  float alpha_t = op->alpha_t;
+  float beta1_t = op->beta1_t;
+  float beta2_t = op->beta2_t;
+  for (int i = 0; i < op->parameters_num; i++) {
+    // update the i-th parameter from its slice of the reduced buffer
+    // printf("update %d\n", i);
+    adam_update<<<GET_BLOCKS(size[i]), CUDA_NUM_THREADS, 0, stream>>>(
+        size[i],
+        alpha_t,
+        op->beta1,
+        op->beta2,
+        op->weight_decay,
+        op->epsilon,
+        static_cast<float const *>(workSpace_ptr),
+        accMs[i].get_float_ptr(),
+        accVs[i].get_float_ptr(),
+        accWs[i].get_float_ptr());
+    workSpace_ptr =
+        static_cast<char *>(workSpace_ptr) + size[i] * sizeof(float);
+
+    // advance the per-parameter Adam scalars
+    beta1_t *= op->beta1;
+    beta2_t *= op->beta2;
+    alpha_t = op->alpha * sqrt(1 - beta2_t) / (1 - beta1_t);
+  }
+  cudaEventRecord(t_end, stream);
+  checkCUDA(cudaEventSynchronize(t_end));
+  elapsed = 0;
+  checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  cudaEventDestroy(t_start);
+  cudaEventDestroy(t_end);
+  checkCUDA(cudaFree(allocate_ptr));
+  printf("[optimizer] total time = %.2lfms\n", elapsed);
+}
 #endif
 }; // namespace FlexFlow
diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc
index 963ad8af7..b9f3dc89f 100644
--- a/src/runtime/parallel_tensor.cc
+++ b/src/runtime/parallel_tensor.cc
@@ -135,10 +135,15 @@ bool ParallelTensorShape::operator!=(ParallelTensorShape const &other) const {
 
 size_t ParallelTensorShape::get_piece_size() const {
   size_t piece_size = data_type_size(this->data_type);
+  return piece_size * this->get_piece_num_elements();
+}
+
+size_t ParallelTensorShape::get_piece_num_elements() const {
+  size_t piece_num_elements = 1;
   for (int i = 0; i < this->num_dims; i++) {
-    piece_size *= this->dims[i].size / this->dims[i].degree;
+    piece_num_elements *= this->dims[i].size / this->dims[i].degree;
   }
-  return piece_size;
+  return piece_num_elements;
 }
 
 RecordFormatter ParallelTensorShape::as_dot() const {
@@ -655,18 +660,42 @@ bool ParallelTensorBase::set_tensor(FFModel const *ff,
   // TODO: check data type matches
   // TODO: Currently we use a task launch, change to index launch for NCCL
   // parameter
-  size_t volume = 1, num_replicas = 0;
+  size_t volume = 1, num_replicas = 1;
   if (sync_type == ParameterSyncType::NCCL) {
-    Domain domain = runtime->get_index_space_domain(ctx, parallel_is);
-    num_replicas = domain.get_volume();
+    // Domain domain = runtime->get_index_space_domain(ctx, parallel_is);
+    // num_replicas = domain.get_volume();
+    // count replicas from the tensor's replica dimensions
+    for (int i = 0; i < this->num_dims; i++) {
+      if (this->dims[i].is_replica_dim) {
+        num_replicas *= this->dims[i].size;
+      }
+    }
   } else if (sync_type == ParameterSyncType::PS) {
     num_replicas = 1;
   } else {
-    num_replicas = 1;
+    for (int i = 0; i < this->num_dims; i++) {
+      if (this->dims[i].is_replica_dim) {
+        num_replicas *= this->dims[i].size;
+      }
+    }
+    // num_replicas = 1;
   }
   for (size_t i = 0; i < dim_sizes.size(); i++) {
     volume = volume * dim_sizes[i];
   }
+  // Debug prints
+  {
+    std::string tensor_name;
+    if (owner_op == nullptr) {
+      tensor_name = "No OwnerOp";
+    } else {
+      tensor_name = std::string(owner_op->name);
+    }
+    std::ostringstream oss;
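+    // Join the tensor's dimension sizes into a comma-separated string for
+    // the debug line printed below.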
+    for (int i = 0; i < dim_sizes.size(); i++)
+      oss << dim_sizes[i] << ", ";
+    printf("%s num_replicas(%zu) volume(%zu) dims(%s)\n",
+           tensor_name.c_str(),
+           num_replicas,
+           volume,
+           oss.str().c_str());
+  }
   RegionRequirement req(region, READ_WRITE, EXCLUSIVE, region);
   req.add_field(FID_DATA);
   InlineLauncher launcher(req);
diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc
index f852acaa6..4f44a3a57 100644
--- a/src/runtime/substitution.cc
+++ b/src/runtime/substitution.cc
@@ -34,6 +34,7 @@
 #include "flexflow/ops/split.h"
 #include "flexflow/parallel_ops/combine.h"
 #include "flexflow/parallel_ops/fused_parallel_op.h"
+#include "flexflow/parallel_ops/allreduce.h"
 #include "flexflow/parallel_ops/partition.h"
 #include "flexflow/parallel_ops/reduction.h"
 #include "flexflow/parallel_ops/replicate.h"
@@ -1198,6 +1199,7 @@ void Graph::export_strategy_computation_graph(
   for (auto const &node : s.get_nodes(*this)) {
     // Add node
     if (strategy.find(node) == strategy.end()) {
+      dot.add_node(node, {{"label", node.to_string()}});
       // Check FusedParallel node here and print out the detailed information
       if (node.ptr->op_type == OperatorType::OP_FUSED_PARALLEL) {
         RecordFormatter rf;
@@ -1904,6 +1906,7 @@ void GraphSearchHelper::graph_optimize(
   this->logger->debug() << "Starting graph optimization";
 
   Graph *graph = this->construct_graph();
+  graph->print_dot();
   graph->duplicate_input_nodes();
   std::unordered_map<Node, MachineView> empty_strategy;
   if (!this->config.export_strategy_computation_graph_file.empty()) {
@@ -3602,6 +3605,7 @@ void FFModel::graph_optimize(
     this->graph_search->graph_optimize(
         budget, only_data_parallel, best_graph, optimal_views);
   }
+  best_graph->print_dot();
 }
 
 bool FFModel::convert_graph_to_operators(
@@ -3702,7 +3706,8 @@ bool FFModel::convert_graph_to_operators(
     case OP_SOFTMAX: {
       assert(inList.size() == 1);
       Softmax *softmax = (Softmax *)node.ptr;
-      new_op = new Softmax(*this, inputs[0], softmax->dim, NULL);
+      new_op = new Softmax(
+          *this, inputs[0], softmax->dim, softmax->last_layer, NULL);
       break;
     }
     case OP_COMBINE: {
@@ -3739,6 +3744,12 @@ bool FFModel::convert_graph_to_operators(
                               reduction->reduction_degree);
       break;
     }
+    case OP_ALLREDUCE: {
+      assert(inList.size() == 1);
+      AllReduce *allreduce = (AllReduce *)node.ptr;
+      new_op = new AllReduce(*this, inputs[0], allreduce->allreduce_dim);
+      break;
+    }
     case OP_FUSED_PARALLEL: {
       assert(inList.size() == 1);
       FusedParallelOp *fused = (FusedParallelOp *)node.ptr;