2 changes: 2 additions & 0 deletions .gitignore
@@ -12,3 +12,5 @@ ShareGPT_V3_unfiltered_cleaned_split.json

.vscode/settings.json

ibm-triton-lib/ibm_triton_lib.egg-info/

3 changes: 3 additions & 0 deletions .gitmodules
@@ -7,3 +7,6 @@
[submodule "vllm"]
path = vllm
url = https://github.com/vllm-project/vllm.git
[submodule "third_party/fmwork"]
path = third_party/fmwork
url = [email protected]:bringlein/fmwork.git
76 changes: 68 additions & 8 deletions Dockerfile
@@ -3,6 +3,7 @@ ARG BASE_UBI_IMAGE_TAG=9.4
ARG PYTHON_VERSION=3.12
ARG MAX_JOBS=64
ARG PIP_VLLM_VERSION=0.8.1
# TODO add ARG CUDA_VERSION=12-8

ARG VLLM_SOURCE=pip
# or VLLM_SOURCE=custom
@@ -122,6 +123,31 @@ ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
python3 setup.py bdist_wheel --dist-dir=/workspace/

# ## flashinfer Builder #################################################################
# FROM vllm-builder_custom AS flashinfer-builder
# ARG MAX_JOBS
#
# # # build deps?
# # RUN --mount=type=cache,target=/root/.cache/pip \
# # --mount=type=cache,target=/root/.cache/uv \
# # uv pip install ninja cmake wheel pybind11 setuptools
#
# WORKDIR /workspace/flashinfer
# RUN git clone --recursive https://github.com/flashinfer-ai/flashinfer.git
#
# ENV TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
# ENV FLASHINFER_ENABLE_SM90=1
# RUN --mount=type=cache,target=/root/.cache/pip \
# cd flashinfer \
# && export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} FLASHINFER_ENABLE_SM90=${FLASHINFER_ENABLE_SM90} \
# && python -m flashinfer.aot \
# && python -m build --no-isolation --wheel
#
# # uv pip install \
# # --no-build-isolation "git+https://github.com/flashinfer-ai/[email protected]"
#
# RUN ls -al /workspace/flashinfer/flashinfer/dist

## Runtime #################################################################
FROM base AS runtime

@@ -227,20 +253,54 @@ RUN --mount=type=cache,target=/root/.cache/pip \
uv pip install pytest llnl-hatchet debugpy

# Install FlashInfer
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
# RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
# echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# RUN --mount=type=cache,target=/root/.cache/pip \
# . /etc/environment && \
# python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
# RUN --mount=type=cache,target=/root/.cache/pip \
# . /etc/environment && \
# uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
# RUN --mount=type=cache,target=/root/.cache/pip \
# uv pip install flashinfer-python -i https://flashinfer.ai/whl/cu124/torch2.6/ --no-deps
# RUN --mount=type=cache,target=/root/.cache/pip \
# --mount=type=cache,target=/root/.cache/uv \
# uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.5/flashinfer_python-0.2.5+cu124torch2.6-cp38-abi3-linux_x86_64.whl#sha256=43d767b912c0c43a04be99595e0123eab9385fc72530a2874b5fb08e3145c0be
# RUN --mount=type=cache,target=/root/.cache/pip \
# --mount=type=cache,target=/root/.cache/uv \
# uv pip install torch==2.7.0
# RUN --mount=type=cache,target=/root/.cache/pip \
# --mount=type=cache,target=/root/.cache/uv \
# uv pip install https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl
# RUN mkdir /workspace/flashinfer_dist && ls -al /workspace/flashinfer_dist
# COPY --from=flashinfer-builder /workspace/*.whl /workspace/flashinfer_dist
# RUN --mount=type=cache,target=/root/.cache/pip \
# --mount=type=cache,target=/root/.cache/uv \
# uv pip install /workspace/flashinfer_dist/*.whl
# TODO: we need nvcc for the flashinfer installation; the custom build fails, see the commented-out builder stage above
RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
RUN microdnf install -y \
cuda-nvcc-12-8 cuda-nvtx-12-8 cuda-libraries-devel-12-8 && \
microdnf clean all
ENV CUDA_HOME="/usr/local/cuda" \
PATH="${CUDA_HOME}/bin:${PATH}" \
LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
ENV TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
ENV FLASHINFER_ENABLE_SM90=1
RUN TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} FLASHINFER_ENABLE_SM90=${FLASHINFER_ENABLE_SM90} uv pip install \
--no-build-isolation "git+https://github.com/flashinfer-ai/[email protected]"
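
Since FlashInfer 0.2.5 is now compiled from source against the CUDA 12.8 toolchain installed just above, a cheap smoke test directly after the install catches a broken build at image-build time. The RUN line below is an illustrative addition, not part of this change:

    RUN nvcc --version && python3 -c "import flashinfer"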


# The nvidia cuda_cupti wheel ships only the versioned libcupti.so.12; create the
# unversioned name next to it so tools that look for libcupti.so can find it.
RUN ln -s ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_cupti/lib/libcupti.so.12 ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_cupti/lib/libcupti.so

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness && cd lm-evaluation-harness && uv pip install .

RUN git clone --depth 1 https://github.com/IBM/fmwork.git
# RUN git clone --depth 1 https://github.com/IBM/fmwork.git
COPY third_party/fmwork fmwork

ENV STORE_TEST_RESULT_PATH=/results

Expand All @@ -250,7 +310,7 @@ COPY vllm/tests tests
COPY ShareGPT_V3_unfiltered_cleaned_split.json ShareGPT_V3_unfiltered_cleaned_split.json

# Copy third-party kernels and insert into path
COPY third_party third_party
COPY third_party/kernels third_party
ENV PYTHONPATH /workspace
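
With `PYTHONPATH=/workspace`, the contents of `third_party/kernels` (copied into `/workspace/third_party`) become importable inside the container. A quick hypothetical check that the path is wired up:

    python3 -c "import sys; print('/workspace' in sys.path)"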

# see https://github.com/IBM/triton-dejavu?tab=readme-ov-file#environment-variables
4 changes: 2 additions & 2 deletions Makefile
@@ -60,10 +60,10 @@ clean:

ifndef CI_ENABLED
format:
python -m black scripts ibm-triton-lib third_party
python -m black scripts ibm-triton-lib
else
format:
python -m black --check --verbose scripts ibm-triton-lib third_party
python -m black --check --verbose scripts ibm-triton-lib
endif
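
With `third_party` now holding a submodule plus vendored kernels, it is dropped from formatting; the `ifndef CI_ENABLED` split means the same target rewrites files locally but only verifies in CI:

    make format               # local: black rewrites scripts and ibm-triton-lib
    CI_ENABLED=1 make format  # CI: black --check, fails on unformatted files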

spelling:
@@ -0,0 +1,26 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_bmm:_bmm_chunk_fwd_kernel)",
"total_bench_time_s": 4.903317928314209,
"evaluated_configs": 9,
"keys": [
"chunk_size",
"K",
"IS_CAUSAL"
],
"cache": {
"('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 32, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 5, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": [
0.007391999941319227
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": false
}
}
@@ -0,0 +1,26 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_bmm:_bmm_chunk_fwd_kernel)",
"total_bench_time_s": 10756.567904472351,
"evaluated_configs": 2625,
"keys": [
"chunk_size",
"K",
"IS_CAUSAL"
],
"cache": {
"('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": [
0.002230335958302021
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": true
}
}
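
The two cache files above contrast triton-dejavu's small default search for `_bmm_chunk_fwd_kernel` (9 configs, ~5 s of tuning, best 0.0074 ms) with the exhaustive one (2625 configs, ~3 h, best 0.0022 ms). The filenames below are placeholders, since the cache paths are not visible in this view; a quick way to pull the best timing out of each file:

    # hypothetical paths to the two dejavu cache files
    jq '.timings[][0]' default/cache.json exhaustive/cache.json

Note that the runs also differ in `cuda_graphs` (false vs. true), so part of the apparent speedup is measurement mode rather than a better config.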
@@ -0,0 +1,25 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_cumsum_fwd_kernel)",
"total_bench_time_s": 7.295067548751831,
"evaluated_configs": 7,
"keys": [
"chunk_size",
"nheads"
],
"cache": {
"('256', '128', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": "BLOCK_SIZE_H: 2, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '128', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": [
0.007071999832987785
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": false
}
}
@@ -0,0 +1,25 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_cumsum_fwd_kernel)",
"total_bench_time_s": 7.361271619796753,
"evaluated_configs": 7,
"keys": [
"chunk_size",
"nheads"
],
"cache": {
"('256', '128', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": "BLOCK_SIZE_H: 2, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '128', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": [
0.002133406000211835
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": true
}
}
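
The `_chunk_cumsum_fwd_kernel` pair makes the measurement-mode effect explicit: both searches select the identical config (BLOCK_SIZE_H: 2, num_warps: 4, num_stages: 3), yet the recorded timing drops from 0.0071 ms to 0.0021 ms, so that gap comes from benchmarking under CUDA graphs, not from a faster kernel.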
@@ -0,0 +1,31 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_scan:_chunk_scan_fwd_kernel)",
"total_bench_time_s": 22.759257316589355,
"evaluated_configs": 11,
"keys": [
"chunk_size",
"hdim",
"dstate",
"IS_CAUSAL"
],
"cache": {
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": "BLOCK_SIZE_M: 128, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None",
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32')": "BLOCK_SIZE_M: 32, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 5, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": [
0.014240000396966934
],
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32')": [
0.8048959970474243
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": false
}
}
@@ -0,0 +1,27 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_scan:_chunk_scan_fwd_kernel)",
"total_bench_time_s": 15278.822125434875,
"evaluated_configs": 2625,
"keys": [
"chunk_size",
"hdim",
"dstate",
"IS_CAUSAL"
],
"cache": {
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": "BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 16, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": [
0.014237518422305584
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": true
}
}
@@ -0,0 +1,26 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_fwd_kernel)",
"total_bench_time_s": 5.0212812423706055,
"evaluated_configs": 9,
"keys": [
"hdim",
"dstate",
"chunk_size"
],
"cache": {
"('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 5, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": [
0.009247999638319016
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": false
}
}
@@ -0,0 +1,26 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_fwd_kernel)",
"total_bench_time_s": 9348.028031349182,
"evaluated_configs": 2625,
"keys": [
"hdim",
"dstate",
"chunk_size"
],
"cache": {
"('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 64, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": [
0.003924777265638113
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": true
}
}
@@ -0,0 +1,8 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_varlen_kernel)",
"total_bench_time_s": 0.0,
"evaluated_configs": 0,
"keys": null,
"cache": {},
"timings": {}
}