Changes from all commits (759 commits)
f4e46d2
Fix bug.
zikun-li Sep 7, 2024
bacc515
fix: indeterminate output of customAllReduce
chenzhuofu Sep 7, 2024
101c420
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Sep 7, 2024
3a35387
fix: request expected latency
chenzhuofu Sep 8, 2024
9b2245b
feat: add GenerationRequest
chenzhuofu Sep 9, 2024
2112b48
feat: add EmissionMachine to simulate requests arrival
chenzhuofu Sep 9, 2024
86e31c3
chore: minor
chenzhuofu Sep 9, 2024
0997fad
chore: minor
chenzhuofu Sep 9, 2024
ae0b8e3
feat: update load_pending_requests logic
chenzhuofu Sep 9, 2024
132f68f
fix: dead lock in request manager; client wait until server init
chenzhuofu Sep 10, 2024
c57b3ee
feat: client support prompt input with slo_ratio
chenzhuofu Sep 10, 2024
2040cf7
feat: add an prompt processing script
chenzhuofu Sep 10, 2024
03ba37e
style: minor format
chenzhuofu Sep 10, 2024
36fb00e
feat: add slo attainment metric
chenzhuofu Sep 10, 2024
fd6f610
chore: minor
chenzhuofu Sep 10, 2024
6f89252
feat: separate max_tokens_per_batch for SSM and LLM
chenzhuofu Sep 10, 2024
d67d577
chore: remove redundant max_spec_tree_tokens
chenzhuofu Sep 11, 2024
1b5c66e
chore: minor
chenzhuofu Sep 11, 2024
d19cd75
style: format
chenzhuofu Sep 11, 2024
6c20f18
Merge pull request #1494 from flexflow/specscheduler-request-emission
chenzhuofu Sep 12, 2024
6e37125
chore: minor output
chenzhuofu Sep 14, 2024
3c4e50e
Fix bugs in the scheduler.
zikun-li Sep 14, 2024
62ac7ed
feat: add max_tokens_per_prefilling_batch
chenzhuofu Sep 14, 2024
da91d84
feat: support batched prefilling
chenzhuofu Sep 14, 2024
d013079
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Sep 15, 2024
1637ed4
style: format
chenzhuofu Sep 15, 2024
bcb028c
Add a switch for early termination based on slo attainment.
zikun-li Sep 15, 2024
020a210
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
zikun-li Sep 15, 2024
06d332c
fix: memory misalignment
chenzhuofu Sep 15, 2024
cf7b7b9
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Sep 15, 2024
5ddeb11
chore: minor
chenzhuofu Sep 16, 2024
fd6eb7b
Reimplemented add_tokens_to_spec_token_tree.
chenzhuofu Sep 16, 2024
4b4d55c
merge
chenzhuofu Sep 16, 2024
5623fc5
chore: refactor lock
chenzhuofu Sep 16, 2024
f524aac
fix: request per batch
chenzhuofu Sep 17, 2024
d42c6ce
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Sep 17, 2024
0b7a02f
Optimizes CPU performance of the scheduler
chenzhuofu Sep 18, 2024
fa13afa
chore: incr decode add slo attainment
chenzhuofu Sep 18, 2024
86f95dc
Optimized some usage of priority queues.
chenzhuofu Sep 18, 2024
f169812
feat: support slo ratio sampling
chenzhuofu Sep 18, 2024
e1f711b
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Sep 18, 2024
7ae7edd
fix: incr_decode doesn'y have slo attainment metric
chenzhuofu Sep 19, 2024
ff3af26
feat: support early_drop switch
chenzhuofu Sep 19, 2024
1a1dc56
chore: add request_per_second param
chenzhuofu Sep 19, 2024
9f034a4
chore: change early drop logic
chenzhuofu Sep 19, 2024
fe55382
feat: add emission output
chenzhuofu Sep 20, 2024
0420199
Dynamically control tree width to not exceed max_tokens_per_ssm_batch.
chenzhuofu Sep 21, 2024
7c7376a
Simplified the method to add tokens to the token trees.
chenzhuofu Sep 22, 2024
4396fc9
Dynamic max tree depth control
chenzhuofu Sep 24, 2024
eee85fe
feat: update raft dependency (select_k)
chenzhuofu Sep 24, 2024
7caaf72
feat: raft build file
chenzhuofu Sep 24, 2024
2ab10b1
chore: minor
chenzhuofu Sep 24, 2024
57f6378
feat: update argTopk op
chenzhuofu Sep 24, 2024
47be784
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Sep 24, 2024
9fa1f4e
chore: update emission trace
chenzhuofu Sep 24, 2024
0a516c6
feat: add TraceEmissionMachine
chenzhuofu Sep 26, 2024
2071273
Add back old scheduler
chenzhuofu Sep 28, 2024
79f9130
feat: add trace generator
chenzhuofu Oct 1, 2024
f224b5e
fix: initialization issue; read microsecond
chenzhuofu Oct 1, 2024
1fe612b
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Oct 1, 2024
18a70ff
Update nccl (#1507)
goliaro Sep 21, 2024
ebd45d3
speedup docker builds
goliaro Sep 22, 2024
347d9ad
update
goliaro Sep 22, 2024
62925bb
fix: emission time
chenzhuofu Oct 2, 2024
2e5db3c
feat: trace generator add scaling_factor
chenzhuofu Oct 2, 2024
a17ec6e
feat: add old_scheduler option
chenzhuofu Oct 3, 2024
efead4f
feat: cherry-pick https://github.com/flexflow/FlexFlow/commit/9784b5c…
jiazhihao Aug 12, 2024
285696e
update legion version
goliaro Aug 28, 2024
de55a2e
Fix nccl-induced segfault (#1481)
goliaro Aug 31, 2024
71d8a7b
add page_manager and request_manager functions
Bob-Chen222 Oct 3, 2024
0eaca39
add batch_config
Bob-Chen222 Oct 3, 2024
b5fbc8b
Add option to enable old scheduler.
chenzhuofu Oct 4, 2024
a1035f8
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Oct 4, 2024
03eb516
Merge.
chenzhuofu Oct 4, 2024
3fbb364
feat: cherry-pick from https://github.com/flexflow/FlexFlow/pull/1517…
jiazhihao Oct 3, 2024
6482d76
fix: long request support
chenzhuofu Oct 4, 2024
622b8a8
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Oct 4, 2024
a23cddb
fix: memory leakage in file_loader
chenzhuofu Oct 5, 2024
e845953
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Oct 5, 2024
3574e51
feat: support inf slo ratio
chenzhuofu Oct 5, 2024
4accd43
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Oct 5, 2024
272a2e9
chore: minor
chenzhuofu Oct 5, 2024
29f5c69
fix: add logic of batch prefilling, request should be taken back and …
chenzhuofu Oct 6, 2024
dcb61c7
style: minor format
chenzhuofu Oct 6, 2024
1659fde
chore: minor info output
chenzhuofu Oct 6, 2024
a2a5174
chore: use unordered_map in argtopk
chenzhuofu Oct 7, 2024
00a98eb
chore: minor
chenzhuofu Oct 7, 2024
8a28da5
chore: add goodput report
chenzhuofu Oct 7, 2024
1e68324
chore: minor
chenzhuofu Oct 7, 2024
239fe17
chore: replace busy_waiting to condition_variable
chenzhuofu Oct 7, 2024
381a808
feat: make some tasks concurrent
chenzhuofu Oct 8, 2024
151872f
request manager h and request manger cc to be continued
Bob-Chen222 Oct 8, 2024
904364d
Merge remote-tracking branch 'origin/specscheduler' into paged_attent…
chenzhuofu Oct 8, 2024
e3abef8
refactored the interface of block manager but may not be bug free
chenzhuofu Oct 8, 2024
d9ff5ee
chore: add more profiling
chenzhuofu Oct 9, 2024
73dc699
ckpt before build
chenzhuofu Oct 10, 2024
de0b803
some fix
Bob-Chen222 Oct 10, 2024
0e405c1
ready for sanity check
Bob-Chen222 Oct 10, 2024
dec2266
Merge remote-tracking branch 'origin/specscheduler' into paged_attent…
Bob-Chen222 Oct 10, 2024
6b4777e
fix last commit index
Bob-Chen222 Oct 10, 2024
b02d763
chore: minor
Bob-Chen222 Oct 11, 2024
8394f15
fix request id error
Bob-Chen222 Oct 11, 2024
34b3f37
fix: allreduce should handle `elts==0`
Bob-Chen222 Oct 11, 2024
2ec8b5b
fix spec token num
chenzhuofu Oct 11, 2024
b12df8c
fix small error in free_multiple_blocks
chenzhuofu Oct 11, 2024
6298f2a
ckpt single request
Bob-Chen222 Oct 11, 2024
c00ddec
add cleanup
Bob-Chen222 Oct 11, 2024
b1ff323
ckpt before index error in prepare_parameters
Bob-Chen222 Oct 11, 2024
0083c2f
fix: embedding use real batch_size
chenzhuofu Oct 11, 2024
869d326
fix: residualRMSNorm uses real batch size
chenzhuofu Oct 11, 2024
8a3975a
fix token error in prepare_batch_config
Bob-Chen222 Oct 11, 2024
f4e73ea
ckpt, something wrong in the prefilling
Bob-Chen222 Oct 11, 2024
f9d9415
fix: SigmoidSiluMulti uses real batch size
chenzhuofu Oct 12, 2024
6bb79dd
style: format
chenzhuofu Oct 12, 2024
4eeb021
ckpt
Bob-Chen222 Oct 12, 2024
12fafa3
Merge remote-tracking branch 'origin/specscheduler' into paged_attent…
Bob-Chen222 Oct 12, 2024
3ad0ca5
update
Bob-Chen222 Oct 12, 2024
1bafe66
chore: minor
chenzhuofu Oct 12, 2024
03e8c5b
fix: some minor issue
chenzhuofu Oct 13, 2024
efce3e7
fix: reduce cudaGraph memory consumption
chenzhuofu Oct 13, 2024
a224700
feat: add max_output_length
chenzhuofu Oct 13, 2024
5e1cb7c
feat: added upper limit for number of tokens to attain slo
chenzhuofu Oct 14, 2024
e7a8613
chore: minor
chenzhuofu Oct 14, 2024
755d422
feat: modify logic of early stop
chenzhuofu Oct 14, 2024
0185ae1
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Oct 14, 2024
c6b5deb
fix: load request as long as available
chenzhuofu Oct 14, 2024
b1e51b2
fix: bug in early stop
chenzhuofu Oct 14, 2024
ba2504e
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Oct 14, 2024
5975cd2
feat: trace generator sample the prompt
chenzhuofu Oct 14, 2024
8c84538
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Oct 14, 2024
b5b7594
feat: add mean tpot statistic
chenzhuofu Oct 14, 2024
57cfe1b
style: format
chenzhuofu Oct 14, 2024
ad9b240
fix: modify token add toward slo
chenzhuofu Oct 15, 2024
9cf66c1
fix: max_spec_tree_token_num
chenzhuofu Oct 15, 2024
c8d442d
feat: add two naive scheduling policies
chenzhuofu Oct 16, 2024
945dee9
Merge remote-tracking branch 'origin/specscheduler' into paged_attent…
Bob-Chen222 Oct 20, 2024
19e41d6
add some docuementation and delete print
Bob-Chen222 Oct 21, 2024
b1793fb
add additional flag max-kv-cache-size
Bob-Chen222 Oct 21, 2024
26cbf6a
chore: typo
chenzhuofu Oct 21, 2024
3ec91d9
fix tokenizer conversion
Oct 15, 2024
3f0383e
update
Oct 15, 2024
3f590ae
update
Oct 15, 2024
14eb152
update
sfc-gh-goliaro Oct 22, 2024
13615f4
add special tokens
sfc-gh-goliaro Oct 22, 2024
5a0c1ca
Update LLAMA tokenizer (#1524)
sfc-gh-goliaro Sep 29, 2024
f11bcf0
rope
sfc-gh-goliaro Oct 22, 2024
674eed7
fix
sfc-gh-goliaro Oct 22, 2024
2dab7cb
fix
sfc-gh-goliaro Oct 22, 2024
92199d0
linting
sfc-gh-goliaro Oct 22, 2024
3f61102
fix
sfc-gh-goliaro Oct 22, 2024
7f1c4e3
fix
sfc-gh-goliaro Oct 22, 2024
0fe773a
Merge pull request #1530 from flexflow/sd
chenzhuofu Oct 22, 2024
69f41f5
feat: set concurrency_barrier for nccl op
chenzhuofu Oct 23, 2024
fe39c54
chore: unify update_custom_mask calling
chenzhuofu Oct 23, 2024
d5e4d41
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Oct 23, 2024
f882347
style: format
chenzhuofu Oct 23, 2024
4593c79
fix: StreamingLLM custom_mask
chenzhuofu Oct 23, 2024
38c9610
fix: streamingllm execute correctly!
chenzhuofu Oct 24, 2024
e40f47f
fix: interleaving acc rate
chenzhuofu Oct 24, 2024
eac11f0
fix: minor
chenzhuofu Oct 24, 2024
d09259e
fix
jinhongyii Oct 26, 2024
f720144
feat: update weight file naming style
goliaro Feb 20, 2024
1c654af
fix: file_loader
goliaro Feb 20, 2024
0a9cb8f
fix
goliaro Feb 21, 2024
6f13b5b
fix
goliaro Feb 21, 2024
82cb8de
feat: update file_loader to latest ver. on `peft`
chenzhuofu Oct 30, 2024
1989185
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Oct 30, 2024
7667483
fix: alignment issue
chenzhuofu Oct 31, 2024
7b7db5b
fix: use double for latencies
chenzhuofu Nov 2, 2024
da92d53
feat: modified the logic of distributing the budget across requests
chenzhuofu Nov 4, 2024
fd65a90
Merge remote-tracking branch 'origin/specscheduler' into paged_attent…
Bob-Chen222 Nov 4, 2024
832f5cb
fix for merge
Bob-Chen222 Nov 4, 2024
4a7162f
init page manager at request manager init and clean the format
Bob-Chen222 Nov 4, 2024
6b74f93
ckpt
Bob-Chen222 Nov 5, 2024
20cb714
refactor and add kv cache flag via page manager
Bob-Chen222 Nov 5, 2024
311c450
ckpt for performance issue
Bob-Chen222 Nov 5, 2024
a493f2a
first attempt in incr decoding with page attention
Bob-Chen222 Nov 5, 2024
5250a3b
ckpt for nothing
Bob-Chen222 Nov 6, 2024
18d6d45
feat: modify the logic of the scheduler
chenzhuofu Nov 7, 2024
810983e
fix compilation error
Bob-Chen222 Nov 7, 2024
f7656be
all good for spec, now test incr
Bob-Chen222 Nov 7, 2024
8c203ec
typo
Bob-Chen222 Nov 7, 2024
3c158f8
workable incrdecoding!
Bob-Chen222 Nov 7, 2024
3b34a5b
Merge remote-tracking branch 'origin/specscheduler' into paged_attent…
Bob-Chen222 Nov 7, 2024
7d612f7
refactor
Bob-Chen222 Nov 8, 2024
07ec33e
some format
Bob-Chen222 Nov 8, 2024
dad3d0f
Update request_manager.h
Bob-Chen222 Nov 8, 2024
1693455
Update llama.cc
Bob-Chen222 Nov 8, 2024
a17c130
Update spec_infer.cc
Bob-Chen222 Nov 8, 2024
0f16daf
Update trace_generator.cc
Bob-Chen222 Nov 8, 2024
ff7de09
Update tree_inc_multihead_self_attention.cu
Bob-Chen222 Nov 8, 2024
e3815a9
Update tree_inc_multihead_self_attention.cu
Bob-Chen222 Nov 8, 2024
38f6ef8
Update tree_inc_multihead_self_attention.cu
Bob-Chen222 Nov 8, 2024
80ea225
Update page_manager.cc
Bob-Chen222 Nov 8, 2024
5fe3a8a
Update request_manager.cc
Bob-Chen222 Nov 8, 2024
a721926
Update request_manager.cc
Bob-Chen222 Nov 8, 2024
1e7e2ec
Update request_manager.cc
Bob-Chen222 Nov 8, 2024
1792981
Update request_manager.cc
Bob-Chen222 Nov 8, 2024
95023e6
final update
Bob-Chen222 Nov 8, 2024
9ce11b2
feat: load weights in parallel
goliaro Nov 9, 2024
b885c63
fix: compile bug
chenzhuofu Nov 14, 2024
9e062be
feat: upgrade to llama3 rope
chenzhuofu Nov 14, 2024
b798385
Specscheduler evaluation support code (#1541)
goliaro Nov 15, 2024
2990c88
cleanup
goliaro Nov 15, 2024
30efe4d
feat: use custom allreduce for performance
chenzhuofu Nov 16, 2024
76df177
chore: minor
chenzhuofu Nov 16, 2024
6c3bebc
chore: minor
chenzhuofu Nov 16, 2024
13dcb23
Merge pull request #1542 from flexflow/specscheduler_eval
chenzhuofu Nov 16, 2024
48b4153
fix: argtopk memory
chenzhuofu Nov 17, 2024
65f7f52
chore: eliminate inconsistence
goliaro Nov 19, 2024
127ca97
fix: add Legion concurrent_task_barrier to eliminate dead lock in All…
goliaro Nov 19, 2024
a74775b
feat: add SSM_TP
goliaro Nov 19, 2024
54acb6d
chore: minor
goliaro Nov 19, 2024
d845cb2
feat: add flashinfer ResidualRMSNorm
chenzhuofu Nov 21, 2024
1c2875f
feat: improve ResidualRMSNorm
chenzhuofu Nov 23, 2024
1f6dab4
fix: AllReduce minor
chenzhuofu Nov 23, 2024
8fb3917
style: format
chenzhuofu Nov 23, 2024
075d7b2
chore: remove unused
chenzhuofu Nov 23, 2024
4ce7256
chore: remove the concurrent_task_barrier wrapping customAllReduce
chenzhuofu Nov 23, 2024
7a820c1
feat: add device_prop to ff_handle
chenzhuofu Nov 28, 2024
c911faa
feat: add pytorch gemm_cublas
chenzhuofu Nov 28, 2024
d09124c
feat: add pytorch GEMM
chenzhuofu Nov 29, 2024
115a3ff
chore: remove unused
chenzhuofu Nov 29, 2024
1a5803e
feat: add absolute slo constraint
chenzhuofu Dec 4, 2024
7e29665
style: format
chenzhuofu Dec 4, 2024
afaa88f
feat: add seperate server baseline
chenzhuofu Dec 4, 2024
841bee1
fix: update tree depth
chenzhuofu Dec 5, 2024
b0a5918
feat: add a switch for fcfs baseline
chenzhuofu Dec 6, 2024
4c1b2ce
feat: added data structures in request manager to handle preempted re…
chenzhuofu Dec 6, 2024
9fb8885
fix: use num tokens to decode to replace spare latency
chenzhuofu Dec 8, 2024
aa2d36d
feat: support the policy fcfs and smallest time to attain
chenzhuofu Dec 8, 2024
04cf206
chore: scheduling policy minor enhancement
chenzhuofu Dec 9, 2024
522473b
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Dec 9, 2024
847ec41
Merge branch 'specscheduler' into coutinuous-batching-schedulers
chenzhuofu Dec 9, 2024
3e619d8
Merge pull request #1554 from flexflow/coutinuous-batching-schedulers
chenzhuofu Dec 9, 2024
76decb3
chore: minor
chenzhuofu Dec 9, 2024
b920838
feat: add overhead breakdown
chenzhuofu Dec 9, 2024
17cbc9c
fix: overhead breakdown
chenzhuofu Dec 10, 2024
a21f9fb
style: format
chenzhuofu Dec 10, 2024
bc67e97
:Merge branch 'specscheduler' of https://github.com/flexflow/flexflow…
chenzhuofu Jan 24, 2025
a5b7de6
fix: minor
goliaro Jan 26, 2025
76c23c0
feat: merge misc. from `page_attention_new`
chenzhuofu Jan 26, 2025
9c042f5
fix: merge page_manager, also fix some issues
chenzhuofu Jan 26, 2025
2a751fd
style: format code
chenzhuofu Jan 26, 2025
3ed67e4
fix: minor
chenzhuofu Jan 26, 2025
69b9f72
fix: merge page_manager, also fix some issues
chenzhuofu Jan 26, 2025
e0eca51
style: format code
chenzhuofu Jan 26, 2025
0f13a92
Merge branch 'paged_attention_new' of https://github.com/flexflow/fle…
chenzhuofu Jan 26, 2025
e2d6fc6
chore: remove outdated comments
chenzhuofu Jan 31, 2025
918356d
Merge pull request #82 from flexflow/paged_attention_new
chenzhuofu Jan 31, 2025
7 changes: 7 additions & 0 deletions .gitignore
@@ -6,6 +6,8 @@ python/flexflow/core/flexflow_cffi_header.py
*.pb.h
*.o
*.a
*.nsys-rep
*.nfs*

# Byte-compiled / optimized / DLL files
__pycache__/
@@ -188,3 +190,8 @@ python/flexflow/version.txt

inference_tensors
tests/inference/python_test_configs/*.json

core.*
*.out
sharegpt.json
wildchat.json
8 changes: 7 additions & 1 deletion .gitmodules
@@ -22,4 +22,10 @@
[submodule "deps/tokenizers-cpp"]
path = deps/tokenizers-cpp
url = https://github.com/mlc-ai/tokenizers-cpp.git
fetchRecurseSubmodules = true
fetchRecurseSubmodules = true
[submodule "deps/flashinfer"]
path = deps/flashinfer
url = https://github.com/flashinfer-ai/flashinfer.git
[submodule "deps/raft"]
path = deps/raft
url = https://github.com/rapidsai/raft.git
27 changes: 26 additions & 1 deletion CMakeLists.txt
@@ -4,6 +4,12 @@ project(FlexFlow)

include(ExternalProject)

enable_language(CXX)
enable_language(CUDA)
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8)
message(FATAL_ERROR "Your C++ compiler is too old. Please upgrade to version 8 or higher.")
endif()

# Set policy CMP0074 to eliminate cmake warnings
cmake_policy(SET CMP0074 NEW)
cmake_policy(SET CMP0077 NEW)
@@ -128,6 +134,9 @@ list(APPEND CC_FLAGS
list(APPEND NVCC_FLAGS
-std=c++17)

list(APPEND NVCC_FLAGS
--expt-relaxed-constexpr
--extended-lambda)

add_compile_options(${CC_FLAGS})
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS})
@@ -201,6 +210,12 @@ if(NOT BUILD_LEGION_ONLY)
# optional
include(optional)

set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/deps/raft/cpp/build/install)
find_package(raft)
list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/raft/cpp/include)

list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/flashinfer/include)

if (FF_GPU_BACKEND STREQUAL "cuda")
list(APPEND FF_CC_FLAGS
-DFF_USE_CUDA)
@@ -290,6 +305,12 @@ if(NOT BUILD_LEGION_ONLY)
LIST_DIRECTORIES False
${FLEXFLOW_ROOT}/src/*.cu)

# tensorrt_llm custom allreduce
if(FF_USE_NCCL)
list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/tensorrt_llm)
list(APPEND FLEXFLOW_GPU_SRC ${CMAKE_CURRENT_SOURCE_DIR}/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu)
endif()

add_compile_definitions(FF_USE_CUDA)

if(BUILD_SHARED_LIBS)
@@ -397,6 +418,8 @@ if(NOT BUILD_LEGION_ONLY)
target_link_libraries(flexflow ${LEGION_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
endif()

target_link_libraries(flexflow raft::raft)

#library api version, bump from time to time
set(SOVERSION 1)

@@ -425,7 +448,7 @@ if(NOT BUILD_LEGION_ONLY)
# generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library
add_custom_command(TARGET flexflow
POST_BUILD
COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS}
COMMAND CMAKE_BUILD_DIR=${Legion_BINARY_DIR}/runtime CMAKE_INSTALL_PREFIX=${Legion_BINARY_DIR} ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python
)
# create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead.
@@ -557,7 +580,9 @@ if(NOT BUILD_LEGION_ONLY)

if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES)
add_subdirectory(inference/spec_infer)
add_subdirectory(inference/simplified_infer)
add_subdirectory(inference/incr_decoding)
add_subdirectory(inference/trace_generator)
endif()


7 changes: 5 additions & 2 deletions FlexFlow.mk
@@ -95,9 +95,12 @@ ifneq ($(strip $(FF_USE_PYTHON)), 1)
endif


INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/inference -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src
INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/inference -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src \
-I${FF_HOME}/deps/raft/cpp/include -I${FF_HOME}/deps/rmm/include -I${FF_HOME}/deps/spdlog/include \
-I${FF_HOME}/deps/flashinfer/include
CC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
NVCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
NVCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 \
--expt-relaxed-constexpr --extended-lambda
HIPCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
GASNET_FLAGS +=
# For Point and Rect typedefs
Binary file added benchmarking/average_accepted_tokens.pdf
88 changes: 88 additions & 0 deletions benchmarking/benchmark_incr_dec.sh
@@ -0,0 +1,88 @@
#! /usr/bin/env bash
set -x
set -e

# Cd into directory holding this script
cd "${BASH_SOURCE[0]%/*}/../build"

# export BUILD_TYPE=Debug
# ../config/config.linux
make -j install

model_name=meta-llama/Llama-3.1-70B-Instruct
NGPUS=8
NCPUS=16
FSIZE=36000
ZSIZE=200000
CSIZE=100000

# comment these lines in for debugging
# model_name=meta-llama/Llama-3.1-8B-Instruct
# NGPUS=8
# FSIZE=36000
# ZSIZE=30000
# CSIZE=100000



MAX_SEQ_LEN=7000
tokens_per_batch=1024

batch_sizes=(
8
4
)

request_per_second_values=(
-1
1
2
4
8
)

dataset_name="sharegpt"
dataset_fp="../benchmarking/${dataset_name}.json"
partition_name="all"

export LEGION_BACKTRACE=1

# python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='meta-llama/Llama-3.1-70B-Instruct', allow_patterns='*.safetensors', max_workers=30)"
# python ../inference/utils/download_hf_model.py --half-precision-only $model_name --refresh-cache

for k in "${!request_per_second_values[@]}"; do
for j in "${!batch_sizes[@]}"; do
batch_size=${batch_sizes[$j]}
request_per_second=${request_per_second_values[$k]}

echo "Running dataset ${dataset_fp} with model ${model_name}, batch size ${batch_size}, tokens per batch ${tokens_per_batch}, and request per second ${request_per_second}"
# create model name version where "/" is replaced with "-"
model_name_=$(echo $model_name | tr / -)
if [ $request_per_second -gt 0 ]; then
rate=$request_per_second
else
rate="offline"
fi
log_fp="/usr/FlexFlow/inference/output/incr_dec_llm_${model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.log"
output_fp="/usr/FlexFlow/inference/output/incr_dec_llm_${model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.json"
metrics_fp="/usr/FlexFlow/inference/output/incr_dec_llm_${model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.csv"
rm $metrics_fp $output_fp $log_fp || true

time ./inference/simplified_infer/incr_dec \
-ll:gpu $NGPUS -ll:cpu $NCPUS -ll:util $NCPUS \
-tensor-parallelism-degree $NGPUS \
-ll:fsize $FSIZE -ll:zsize $ZSIZE -ll:csize $CSIZE \
--fusion \
--max-sequence-length $MAX_SEQ_LEN \
--max-requests-per-batch $batch_size \
--max-tokens-per-batch $tokens_per_batch \
--max-output-length 1024 \
--request-per-second ${request_per_second} \
-llm-model $model_name \
-trace ${dataset_fp} \
-trace-output-path ${output_fp} \
-csv-output-path $metrics_fp \
-target-partition ${partition_name} \
2>&1 | tee ${log_fp}
done
done
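The `--request-per-second` flag above feeds the request-arrival simulation this PR introduces (the `EmissionMachine` commits), with a negative rate meaning "offline", i.e. all requests available up front. A minimal sketch of Poisson-process arrival generation under that assumption — the helper name and exact semantics are illustrative, not FlexFlow's actual implementation:

```python
import random

def emission_times(num_requests: int, request_per_second: float,
                   seed: int = 0) -> list[float]:
    """Generate arrival timestamps (seconds) for a Poisson arrival process.

    A non-positive rate models the offline setting: every request is
    available at time zero.
    """
    if request_per_second <= 0:
        return [0.0] * num_requests
    rng = random.Random(seed)
    times, now = [], 0.0
    for _ in range(num_requests):
        # Exponential inter-arrival time with mean 1 / rate.
        now += rng.expovariate(request_per_second)
        times.append(now)
    return times

arrivals = emission_times(5, request_per_second=2.0)
```

Sweeping `request_per_second` over `(-1 1 2 4 8)` as the script does then covers both the offline case and increasing online load.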
109 changes: 109 additions & 0 deletions benchmarking/benchmark_specinfer.sh
@@ -0,0 +1,109 @@
#! /usr/bin/env bash
set -x
set -e

# Cd into directory holding this script
cd "${BASH_SOURCE[0]%/*}/../build"

# export BUILD_TYPE=Debug
# ../config/config.linux
make -j
source ./set_python_envs.sh
# reset

model_name=meta-llama/Llama-3.1-70B-Instruct
NGPUS=8
NCPUS=16
FSIZE=36000
ZSIZE=200000
CSIZE=100000

# comment these lines in for debugging
# model_name=meta-llama/Llama-3.1-8B-Instruct
# NGPUS=8
# FSIZE=36000
# ZSIZE=30000
# CSIZE=100000
######################################

small_model_names=(
Zhuominc/Llama-3-330M
meta-llama/Llama-3.2-1B-Instruct
meta-llama/Llama-3.2-3B-Instruct
meta-llama/Llama-3.1-8B-Instruct
)

MAX_SEQ_LEN=7000
tokens_per_batch=1024
max_tree_depth=8
expansion_degree=3

batch_sizes=(
8
4
)

request_per_second_values=(
-1
1
2
4
8
)

dataset_name="sharegpt"
dataset_fp="../benchmarking/${dataset_name}.json"
partition_name="all"

export LEGION_BACKTRACE=1

# python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='meta-llama/Llama-3.1-70B-Instruct', allow_patterns='*.safetensors', max_workers=30)"
python ../inference/utils/download_hf_model.py --half-precision-only $model_name
for small_model_name in "${small_model_names[@]}"; do
python ../inference/utils/download_hf_model.py --half-precision-only $small_model_name
done

for k in "${!request_per_second_values[@]}"; do
for j in "${!batch_sizes[@]}"; do
for i in "${!small_model_names[@]}"; do
small_model_name=${small_model_names[$i]}
batch_size=${batch_sizes[$j]}
request_per_second=${request_per_second_values[$k]}

echo "Running dataset ${dataset_fp} with model ${model_name}, draft model ${small_model_name}, batch size ${batch_size}, tokens per batch ${tokens_per_batch}, and request per second ${request_per_second}"
# create model name version where "/" is replaced with "-"
model_name_=$(echo $model_name | tr / -)
small_model_name_=$(echo $small_model_name | tr / -)
if [ $request_per_second -gt 0 ]; then
rate=$request_per_second
else
rate="offline"
fi
log_fp="/usr/FlexFlow/inference/output/specinfer_llm_${model_name_}_ssm_${small_model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.log"
output_fp="/usr/FlexFlow/inference/output/specinfer_llm_${model_name_}_ssm_${small_model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.json"
metrics_fp="/usr/FlexFlow/inference/output/specinfer_llm_${model_name_}_ssm_${small_model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.csv"
rm $metrics_fp $output_fp $log_fp || true

time ./inference/suffix_decoding/specinfer \
-ll:gpu $NGPUS -ll:cpu $NCPUS -ll:util $NCPUS \
-tensor-parallelism-degree $NGPUS \
-ssm-tp-degree $NGPUS \
-ll:fsize $FSIZE -ll:zsize $ZSIZE -ll:csize $CSIZE \
--fusion \
--max-sequence-length $MAX_SEQ_LEN \
--max-requests-per-batch $batch_size \
--max-tokens-per-batch $tokens_per_batch \
--max-output-length 1024 \
--max-tree-depth ${max_tree_depth} \
--expansion-degree ${expansion_degree} \
--request-per-second ${request_per_second} \
-llm-model $model_name \
-ssm-model $small_model_name \
-trace ${dataset_fp} \
-trace-output-path ${output_fp} \
-csv-output-path $metrics_fp \
-target-partition ${partition_name} \
2>&1 | tee ${log_fp}
done
done
done
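Several commits above add an SLO-attainment metric (and later a goodput report) to runs like these. A hedged sketch of how such a metric might be computed — treating `slo_ratio` as a per-token latency budget relative to a baseline time-per-output-token (TPOT) is an assumption about the PR's definition, not a statement of it:

```python
def slo_attainment(latencies, output_lengths, slo_ratio, baseline_tpot):
    """Fraction of requests whose mean time-per-output-token stays
    within slo_ratio * baseline_tpot.

    latencies: end-to-end latency per request, in seconds
    output_lengths: number of generated tokens per request
    """
    budget = slo_ratio * baseline_tpot
    attained = sum(
        1 for lat, n in zip(latencies, output_lengths)
        if n > 0 and lat / n <= budget
    )
    return attained / len(latencies)

# Example: baseline TPOT of 50 ms and slo_ratio 1.5 give a 75 ms/token budget;
# the third request averages 120 ms/token and misses it.
rate = slo_attainment([1.0, 2.0, 6.0], [20, 40, 50],
                      slo_ratio=1.5, baseline_tpot=0.05)
```

A goodput-style report would weight the attained requests by their generated tokens rather than counting requests equally.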