Skip to content

Commit f2ea067

Browse files
authored
Update AINIC bundle to a-56; disable HipBLASLt tuning by default (#563)
Updates the AINIC Docker build inputs and adjusts the pretrain launcher to disable HipBLASLt tuning by default (to avoid profiler/TE issues), while also extending CI to build an additional v25.09 AINIC image variant.

Changes:
- Disable HipBLASLt tuning by default in run_pretrain.sh, requiring an explicit opt-in env var (`PRIMUS_HIPBLASLT_TUNING=1`) to enable it.
- Bump the AINIC bundle used by the AINIC Docker image from a-38 to a-56.
- Update CI to use the new bundle and add a new -v25.09-ainic image build/push step.
1 parent 02ca70d commit f2ea067

File tree

4 files changed

+51
-15
lines changed

4 files changed

+51
-15
lines changed

.github/workflows/ci.yaml

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,7 @@ jobs:
114114
echo "> Build Docker Image with tag: ${{ env.IMAGE_TAG }}-ainic"
115115
start_time=$(date +%s)
116116
mkdir -p $GITHUB_WORKSPACE/.github/workflows/docker/ainic
117-
cp /apps/tas/0_public/primus_docker_ci/ainic/ainic_bundle_1.117.5-a-38.tar.gz $GITHUB_WORKSPACE/.github/workflows/docker/ainic/ || { echo "Error: Failed to copy ainic bundle"; exit 1; }
118-
cp /apps/tas/0_public/primus_docker_ci/ainic/amd-anp-v1.3.0.patch $GITHUB_WORKSPACE/.github/workflows/docker/ainic/
117+
cp /apps/tas/0_public/primus_docker_ci/ainic/ainic_bundle_1.117.5-a-56.tar.gz $GITHUB_WORKSPACE/.github/workflows/docker/ainic/ || { echo "Error: Failed to copy ainic bundle"; exit 1; }
119118
docker build -f $GITHUB_WORKSPACE/.github/workflows/docker/Dockerfile.ainic \
120119
--network=host \
121120
-t tasimage/primus:${{env.IMAGE_TAG}}-ainic \
@@ -131,6 +130,25 @@ jobs:
131130
docker push docker.io/tasimage/primus:${{env.IMAGE_TAG}}-ainic
132131
docker login -u rocmshared -p ${{ secrets.ROCM_DOCKER_HUB_TOKEN }}
133132
133+
echo "> Build Docker Image with tag: ${{ env.IMAGE_TAG }}-v25.09-ainic"
134+
start_time=$(date +%s)
135+
mkdir -p $GITHUB_WORKSPACE/.github/workflows/docker/ainic
136+
cp /apps/tas/0_public/primus_docker_ci/ainic/ainic_bundle_1.117.5-a-56.tar.gz $GITHUB_WORKSPACE/.github/workflows/docker/ainic/ || { echo "Error: Failed to copy ainic bundle"; exit 1; }
137+
docker build -f $GITHUB_WORKSPACE/.github/workflows/docker/Dockerfile_v25.09_ainic \
138+
--network=host \
139+
-t tasimage/primus:${{env.IMAGE_TAG}}-v25.09-ainic \
140+
--build-arg AINIC_BUNDLE_PATH=ainic \
141+
--build-arg PRIMUS_TURBO_COMMIT=${PRIMUS_TURBO_COMMIT} \
142+
$GITHUB_WORKSPACE/.github/workflows/docker
143+
end_time=$(date +%s)
144+
elapsed=$((end_time - start_time))
145+
echo "⏱️ [build primus docker-v25.09-ainic] Total elapsed time: ${elapsed} seconds"
146+
147+
docker tag tasimage/primus:${{env.IMAGE_TAG}}-v25.09-ainic docker.io/tasimage/primus:${{env.IMAGE_TAG}}-v25.09-ainic
148+
docker login -u tasimage -p ${{ secrets.PRIMUS_DOCKER_HUB_TOKEN }}
149+
docker push docker.io/tasimage/primus:${{env.IMAGE_TAG}}-v25.09-ainic
150+
docker login -u rocmshared -p ${{ secrets.ROCM_DOCKER_HUB_TOKEN }}
151+
134152
echo "> Build Docker Image with tag: ${{ env.IMAGE_TAG }}-jax"
135153
start_time=$(date +%s)
136154
docker build -f $GITHUB_WORKSPACE/.github/workflows/docker/Dockerfile \

.github/workflows/docker/Dockerfile.ainic

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,16 @@ ENV MPI_PATH=/opt/ompi
3030
# WARNING: If these paths are missing, tools and libraries may not function correctly.
3131
# INFO: Installation completed successfully
3232

33-
COPY ${AINIC_BUNDLE_PATH}/ainic_bundle_1.117.5-a-38.tar.gz ${WORKDIR}
33+
COPY ${AINIC_BUNDLE_PATH}/ainic_bundle_1.117.5-a-56.tar.gz ${WORKDIR}
3434
RUN cd ${WORKDIR} && \
3535
echo "Building ainic bundle... current directory: ${WORKDIR}" && \
36-
tar zxf ainic_bundle_1.117.5-a-38.tar.gz && \
37-
cd ainic_bundle_1.117.5-a-38 && \
36+
tar zxf ainic_bundle_1.117.5-a-56.tar.gz && \
37+
cd ainic_bundle_1.117.5-a-56 && \
3838
tar zxf host_sw_pkg.tar.gz && \
3939
cd host_sw_pkg && \
40-
./install.sh --domain=user -y 2>&1 | tee log_install.txt
40+
./install.sh --domain=user -y 2>&1 | tee log_install.txt && \
41+
cd ${WORKDIR} && \
42+
apt-get install -y ./amd/ainic/deb-repo/libionic*.deb
4143

4244
# ---------------------------------------------------------------------------
4345
# Build rccl
@@ -54,8 +56,9 @@ ENV RCCL_HOME=${WORKDIR}/rccl
5456
# Build AMD ANP
5557
# ---------------------------------------------------------------------------
5658

57-
RUN apt-get install -y --allow-unauthenticated libionic-dev && \
58-
cd ${WORKDIR} && git clone https://github.com/rocm/amd-anp.git && \
59+
# RUN apt-get install -y --allow-unauthenticated libionic-dev && \
60+
RUN cd ${WORKDIR} && \
61+
git clone https://github.com/rocm/amd-anp.git && \
5962
cd amd-anp && git checkout tags/v1.3.0 && \
6063
make -j 16 RCCL_HOME=${RCCL_HOME} \
6164
MPI_INCLUDE=${MPI_PATH}/include/ \

.github/workflows/docker/Dockerfile_v25.09_ainic

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,15 +36,16 @@ RUN apt-get update && \
3636
# WARNING: If these paths are missing, tools and libraries may not function correctly.
3737
# INFO: Installation completed successfully
3838

39-
COPY ${AINIC_BUNDLE_PATH}/ainic_bundle_1.117.1-a-42.tar.gz /opt/
39+
COPY ${AINIC_BUNDLE_PATH}/ainic_bundle_1.117.5-a-56.tar.gz ${WORKDIR}
4040
RUN cd ${WORKDIR} && \
4141
echo "Building ainic bundle... current directory: ${WORKDIR}" && \
42-
tar zxf ainic_bundle_1.117.1-a-42.tar.gz && \
43-
cd ainic_bundle_1.117.1-a-42 && \
42+
tar zxf ainic_bundle_1.117.5-a-56.tar.gz && \
43+
cd ainic_bundle_1.117.5-a-56 && \
4444
tar zxf host_sw_pkg.tar.gz && \
4545
cd host_sw_pkg && \
4646
./install.sh --domain=user -y 2>&1 | tee log_install.txt && \
47-
cd /opt
47+
cd ${WORKDIR} && \
48+
apt-get install -y ./amd/ainic/deb-repo/libionic*.deb
4849

4950
# =============================== Test AINIC Driver ===============================
5051
# ibv_devices

examples/run_pretrain.sh

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,15 @@ if [ "$USING_AINIC" == "1" ]; then
178178
export ANP_HOME_DIR=${ANP_HOME_DIR:-"/opt/amd-anp"}
179179
export RCCL_HOME_DIR=${RCCL_HOME_DIR:-"/opt/rccl"}
180180
export MPI_HOME_DIR=${MPI_HOME_DIR:-"/opt/ompi"}
181-
export NCCL_NET_PLUGIN=librccl-anp.so
181+
# Check which NCCL net plugin library is present under ${ANP_HOME_DIR}/build and set accordingly
182+
if [ -f "${ANP_HOME_DIR}/build/librccl-anp.so" ]; then
183+
export NCCL_NET_PLUGIN=librccl-anp.so
184+
elif [ -f "${ANP_HOME_DIR}/build/librccl-net.so" ]; then
185+
export NCCL_NET_PLUGIN=librccl-net.so
186+
else
187+
LOG_ERROR "Error: Neither librccl-anp.so nor librccl-net.so found in ${ANP_HOME_DIR}/build."
188+
exit 1
189+
fi
182190

183191
LOG_INFO_RANK0 "Using AINIC"
184192
LOG_INFO_RANK0 "RCCL_HOME_DIR: $RCCL_HOME_DIR"
@@ -490,11 +498,17 @@ handle_hipblaslt_tuning() {
490498
fi
491499
}
492500

493-
# Disable HipBLASLT tuning in deterministic mode
494-
if [ "${PRIMUS_DETERMINISTIC:-}" != "1" ]; then
501+
# NOTE: Disable HipBLASLT tuning in deterministic mode
502+
# NOTE: If you need to enable torch profiler, do NOT enable HipBLASLT tuning.
503+
if [ "${PRIMUS_DETERMINISTIC:-}" != "1" ] && [ "${PRIMUS_HIPBLASLT_TUNING:-0}" = "1" ]; then
495504
handle_hipblaslt_tuning
505+
else
506+
LOG_INFO "disable hipblaslt tuning by default to fix torch profiler issue in TE"
507+
export TE_HIPBLASLT_TUNING_RUN_COUNT=0
508+
export TE_HIPBLASLT_TUNING_ALGO_COUNT=0
496509
fi
497510

511+
498512
# -------------------- Python Path Setup --------------------
499513
setup_pythonpath() {
500514
local site_packages

0 commit comments

Comments
 (0)