2 changes: 2 additions & 0 deletions .gitignore
@@ -12,3 +12,5 @@ ShareGPT_V3_unfiltered_cleaned_split.json

.vscode/settings.json

ibm-triton-lib/ibm_triton_lib.egg-info/

3 changes: 3 additions & 0 deletions .gitmodules
@@ -7,3 +7,6 @@
[submodule "vllm"]
path = vllm
url = https://github.com/vllm-project/vllm.git
[submodule "third_party/fmwork"]
path = third_party/fmwork
url = [email protected]:bringlein/fmwork.git
76 changes: 68 additions & 8 deletions Dockerfile
@@ -3,6 +3,7 @@ ARG BASE_UBI_IMAGE_TAG=9.4
ARG PYTHON_VERSION=3.12
ARG MAX_JOBS=64
ARG PIP_VLLM_VERSION=0.8.1
# TODO add ARG CUDA_VERSION=12-8

ARG VLLM_SOURCE=pip
# or VLLM_SOURCE=custom
@@ -122,6 +123,31 @@ ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
python3 setup.py bdist_wheel --dist-dir=/workspace/

# ## flashinfer Builder #################################################################
# FROM vllm-builder_custom AS flashinfer-builder
# ARG MAX_JOBS
#
# # # build deps?
# # RUN --mount=type=cache,target=/root/.cache/pip \
# # --mount=type=cache,target=/root/.cache/uv \
# # uv pip install ninja cmake wheel pybind11 setuptools
#
# WORKDIR /workspace/flashinfer
# RUN git clone --recursive https://github.com/flashinfer-ai/flashinfer.git
#
# ENV TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
# ENV FLASHINFER_ENABLE_SM90=1
# RUN --mount=type=cache,target=/root/.cache/pip \
# cd flashinfer \
# && export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} FLASHINFER_ENABLE_SM90=${FLASHINFER_ENABLE_SM90} \
# && python -m flashinfer.aot \
# && python -m build --no-isolation --wheel
#
# # uv pip install \
# # --no-build-isolation "git+https://github.com/flashinfer-ai/[email protected]"
#
# RUN ls -al /workspace/flashinfer/flashinfer/dist

## Runtime #################################################################
FROM base AS runtime

@@ -227,20 +253,54 @@ RUN --mount=type=cache,target=/root/.cache/pip \
uv pip install pytest llnl-hatchet debugpy

# Install FlashInfer
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
# RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
# echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# RUN --mount=type=cache,target=/root/.cache/pip \
# . /etc/environment && \
# python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
# RUN --mount=type=cache,target=/root/.cache/pip \
# . /etc/environment && \
# uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
# RUN --mount=type=cache,target=/root/.cache/pip \
# uv pip install flashinfer-python -i https://flashinfer.ai/whl/cu124/torch2.6/ --no-deps
# RUN --mount=type=cache,target=/root/.cache/pip \
# --mount=type=cache,target=/root/.cache/uv \
# uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.5/flashinfer_python-0.2.5+cu124torch2.6-cp38-abi3-linux_x86_64.whl#sha256=43d767b912c0c43a04be99595e0123eab9385fc72530a2874b5fb08e3145c0be
# RUN --mount=type=cache,target=/root/.cache/pip \
# --mount=type=cache,target=/root/.cache/uv \
# uv pip install torch==2.7.0
# RUN --mount=type=cache,target=/root/.cache/pip \
# --mount=type=cache,target=/root/.cache/uv \
# uv pip install https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl
# RUN mkdir /workspace/flashinfer_dist && ls -al /workspace/flashinfer_dist
# COPY --from=flashinfer-builder /workspace/*.whl /workspace/flashinfer_dist
# RUN --mount=type=cache,target=/root/.cache/pip \
# --mount=type=cache,target=/root/.cache/uv \
# uv pip install /workspace/flashinfer_dist/*.whl
# TODO: we need nvcc for the flashinfer installation; the custom build fails, see the commented-out builder stage above
RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
RUN microdnf install -y \
cuda-nvcc-12-8 cuda-nvtx-12-8 cuda-libraries-devel-12-8 && \
microdnf clean all
ENV CUDA_HOME="/usr/local/cuda" \
PATH="${CUDA_HOME}/bin:${PATH}" \
LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
ENV TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
ENV FLASHINFER_ENABLE_SM90=1
RUN TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} FLASHINFER_ENABLE_SM90=${FLASHINFER_ENABLE_SM90} uv pip install \
--no-build-isolation "git+https://github.com/flashinfer-ai/[email protected]"
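
Since FlashInfer 0.2.5 is now compiled from source against the CUDA 12.8 toolchain installed just above, a cheap smoke test directly after the install catches a broken build at image-build time. The RUN line below is an illustrative addition, not part of this change:

    RUN nvcc --version && python3 -c "import flashinfer"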


# The nvidia cuda_cupti wheel ships only the versioned libcupti.so.12; create the
# unversioned name next to it so tools that look for libcupti.so can find it.
RUN ln -s ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_cupti/lib/libcupti.so.12 ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_cupti/lib/libcupti.so

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness && cd lm-evaluation-harness && uv pip install .

RUN git clone --depth 1 https://github.com/IBM/fmwork.git
# RUN git clone --depth 1 https://github.com/IBM/fmwork.git
COPY third_party/fmwork fmwork

ENV STORE_TEST_RESULT_PATH=/results

Expand All @@ -250,7 +310,7 @@ COPY vllm/tests tests
COPY ShareGPT_V3_unfiltered_cleaned_split.json ShareGPT_V3_unfiltered_cleaned_split.json

# Copy third-party kernels and insert into path
COPY third_party third_party
COPY third_party/kernels third_party
ENV PYTHONPATH /workspace
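
With `PYTHONPATH=/workspace`, the contents of `third_party/kernels` (copied into `/workspace/third_party`) become importable inside the container. A quick hypothetical check that the path is wired up:

    python3 -c "import sys; print('/workspace' in sys.path)"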

# see https://github.com/IBM/triton-dejavu?tab=readme-ov-file#environment-variables
4 changes: 2 additions & 2 deletions Makefile
@@ -60,10 +60,10 @@ clean:

ifndef CI_ENABLED
format:
python -m black scripts ibm-triton-lib third_party
python -m black scripts ibm-triton-lib
else
format:
python -m black --check --verbose scripts ibm-triton-lib third_party
python -m black --check --verbose scripts ibm-triton-lib
endif
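
With `third_party` now holding a submodule plus vendored kernels, it is dropped from formatting; the `ifndef CI_ENABLED` split means the same target rewrites files locally but only verifies in CI:

    make format               # local: black rewrites scripts and ibm-triton-lib
    CI_ENABLED=1 make format  # CI: black --check, fails on unformatted files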

spelling:
@@ -0,0 +1,26 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_bmm:_bmm_chunk_fwd_kernel)",
"total_bench_time_s": 4.903317928314209,
"evaluated_configs": 9,
"keys": [
"chunk_size",
"K",
"IS_CAUSAL"
],
"cache": {
"('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 32, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 5, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": [
0.007391999941319227
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": false
}
}
@@ -0,0 +1,26 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_bmm:_bmm_chunk_fwd_kernel)",
"total_bench_time_s": 10756.567904472351,
"evaluated_configs": 2625,
"keys": [
"chunk_size",
"K",
"IS_CAUSAL"
],
"cache": {
"('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": [
0.002230335958302021
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": true
}
}
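
The two cache files above contrast triton-dejavu's small default search for `_bmm_chunk_fwd_kernel` (9 configs, ~5 s of tuning, best 0.0074 ms) with the exhaustive one (2625 configs, ~3 h, best 0.0022 ms). The filenames below are placeholders, since the cache paths are not visible in this view; a quick way to pull the best timing out of each file:

    # hypothetical paths to the two dejavu cache files
    jq '.timings[][0]' default/cache.json exhaustive/cache.json

Note that the runs also differ in `cuda_graphs` (false vs. true), so part of the apparent speedup is measurement mode rather than a better config.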
@@ -0,0 +1,25 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_cumsum_fwd_kernel)",
"total_bench_time_s": 7.295067548751831,
"evaluated_configs": 7,
"keys": [
"chunk_size",
"nheads"
],
"cache": {
"('256', '128', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": "BLOCK_SIZE_H: 2, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '128', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": [
0.007071999832987785
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": false
}
}
@@ -0,0 +1,25 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_cumsum_fwd_kernel)",
"total_bench_time_s": 7.361271619796753,
"evaluated_configs": 7,
"keys": [
"chunk_size",
"nheads"
],
"cache": {
"('256', '128', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": "BLOCK_SIZE_H: 2, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '128', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": [
0.002133406000211835
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": true
}
}
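
The `_chunk_cumsum_fwd_kernel` pair makes the measurement-mode effect explicit: both searches select the identical config (BLOCK_SIZE_H: 2, num_warps: 4, num_stages: 3), yet the recorded timing drops from 0.0071 ms to 0.0021 ms, so that gap comes from benchmarking under CUDA graphs, not from a faster kernel.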
@@ -0,0 +1,31 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_scan:_chunk_scan_fwd_kernel)",
"total_bench_time_s": 22.759257316589355,
"evaluated_configs": 11,
"keys": [
"chunk_size",
"hdim",
"dstate",
"IS_CAUSAL"
],
"cache": {
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": "BLOCK_SIZE_M: 128, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None",
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32')": "BLOCK_SIZE_M: 32, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 5, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": [
0.014240000396966934
],
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32')": [
0.8048959970474243
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": false
}
}
@@ -0,0 +1,27 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_scan:_chunk_scan_fwd_kernel)",
"total_bench_time_s": 15278.822125434875,
"evaluated_configs": 2625,
"keys": [
"chunk_size",
"hdim",
"dstate",
"IS_CAUSAL"
],
"cache": {
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": "BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 16, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": [
0.014237518422305584
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": true
}
}
@@ -0,0 +1,26 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_fwd_kernel)",
"total_bench_time_s": 5.0212812423706055,
"evaluated_configs": 9,
"keys": [
"hdim",
"dstate",
"chunk_size"
],
"cache": {
"('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 5, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": [
0.009247999638319016
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": false
}
}
@@ -0,0 +1,26 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_fwd_kernel)",
"total_bench_time_s": 9348.028031349182,
"evaluated_configs": 2625,
"keys": [
"hdim",
"dstate",
"chunk_size"
],
"cache": {
"('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 64, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": [
0.003924777265638113
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": true
}
}
@@ -0,0 +1,8 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_varlen_kernel)",
"total_bench_time_s": 0.0,
"evaluated_configs": 0,
"keys": null,
"cache": {},
"timings": {}
}