Skip to content

Commit 5c1c479

Browse files
[Nightly] Add op microbench (#1477)
1 parent 36107bf commit 5c1c479

Some content is hidden

Large commits hide some content by default. Use the search box below to find content that may be hidden.

58 files changed

+3078
-0
lines changed

.github/scripts/microbench_summary.sh

+203
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
#! /bin/bash
2+
# This script is for op perf summary, both for forward and backward op
3+
4+
# usage
5+
# Summary forward op time, forward_op_summary.csv is forward summary file
6+
## bash microbench_summary.sh path/to/profile's log forward_op_summary.csv
7+
# Summary backward op time, backward_op_summary.csv is backward summary file, True means summary backward, default is false.
8+
## bash microbench_summary.sh path/to/profile's log backward_op_summary.csv True
9+
10+
results_dir="$1"
11+
output_file="$2"
12+
Get_backward=${3:-False}
13+
cd "$results_dir" || exit
14+
15+
echo "case_name;datatype;op_name;shape;channels_last;dim;output_size;P;reduce;kernel_size;stride;replacement;num_samples;scale_factor;mode;padding_mode;align_corners;shifts;affine;backward;time(us)" >> "$output_file"
16+
17+
function op_summary {
18+
while IFS= read -r line1 && IFS= read -r line2 <&3; do
19+
text=${line1}
20+
IFS=';' read -ra pairs <<< "$(echo "$text" | tr -d '\n' | tr -s ' ')"
21+
for pair in "${pairs[@]}"; do
22+
IFS=':' read -r key value <<< "$pair"
23+
key=$(echo "$key" | xargs)
24+
value=$(echo "$value" | xargs)
25+
if [[ shape = "$key" ]] ; then
26+
shape=${value}
27+
fi
28+
if [[ datatype = "$key" ]] ; then
29+
datatype=${value}
30+
fi
31+
if [[ dim = "$key" ]] || [[ dims = "$key" ]] ; then
32+
dim=${value}
33+
fi
34+
if [[ output_size = "$key" ]] ; then
35+
output_size=${value}
36+
fi
37+
if [[ channels_last = "$key" ]] ; then
38+
channels_last=${value}
39+
fi
40+
if [[ backward = "$key" ]] ; then
41+
backward=${value}
42+
fi
43+
if [[ reduce = "$key" ]] ; then
44+
reduce=${value}
45+
fi
46+
if [[ kernel_size = "$key" ]] ; then
47+
kernel_size=${value}
48+
fi
49+
if [[ P = "$key" ]] ; then
50+
P=${value}
51+
fi
52+
if [[ stride = "$key" ]] ; then
53+
stride=${value}
54+
fi
55+
if [[ replacement = "$key" ]] ; then
56+
replacement=${value}
57+
fi
58+
if [[ num_samples = "$key" ]] ; then
59+
num_samples=${value}
60+
fi
61+
if [[ scale_factor = "$key" ]] ; then
62+
scale_factor=${value}
63+
fi
64+
if [[ mode = "$key" ]] ; then
65+
mode=${value}
66+
fi
67+
if [[ padding_mode = "$key" ]] ; then
68+
padding_mode=${value}
69+
fi
70+
if [[ align_corners = "$key" ]] ; then
71+
align_corners=${value}
72+
fi
73+
if [[ affine = "$key" ]] ; then
74+
affine=${value}
75+
fi
76+
if [[ shifts = "$key" ]] ; then
77+
shifts=${value}
78+
fi
79+
done
80+
number=""
81+
if [[ $line2 =~ ^([0-9.]+)([a-zA-Z]+)$ ]] ; then
82+
number="${BASH_REMATCH[1]}"
83+
unit="${BASH_REMATCH[2]}"
84+
fi
85+
# Align the time units
86+
if [[ $unit == "ms" ]] ;then
87+
number=$(echo "scale=3; $number * 1000" | bc)
88+
fi
89+
if [[ $unit == "s" ]] ;then
90+
number=$(echo "scale=3; $number * 1000000" | bc)
91+
fi
92+
if [[ $Get_backward == "True" ]] && [[ $backward == "False" ]]; then
93+
echo "Only Forward"
94+
else
95+
echo "${i%.*};${datatype};${op_name};$shape;$channels_last;$dim;$output_size;$P;$reduce;$kernel_size;$stride;$replacement;$num_samples;$scale_factor;$mode;$padding_mode;$align_corners;$shifts;$affine;$backward;$number" >> "$output_file"
96+
fi
97+
done < <(echo "$texts") 3< <(echo "$times")
98+
}
99+
100+
filename=$(find -- *.log)
101+
102+
for i in $filename
103+
do
104+
output_size=""
105+
P=""
106+
channels_last=""
107+
dim=""
108+
backward=""
109+
reduce=""
110+
kernel_size=""
111+
affine=""
112+
output_size=""
113+
stride=""
114+
replacement=""
115+
num_samples=""
116+
scale_factor=""
117+
mode=""
118+
padding_mode=""
119+
align_corners=""
120+
shifts=""
121+
case_name="${i%.*}"
122+
op_name=$(echo "$case_name" | awk -F. '{print $NF}')
123+
if [[ $Get_backward == "False" ]] ; then
124+
if [[ $op_name =~ batch_norm ]] ; then
125+
op_name="aten::batch_norm"
126+
times=$(grep -E "${op_name}" "${i}" | awk '{print $10}')
127+
elif [[ $op_name =~ exponential ]] || [[ $op_name =~ geometric ]] || [[ $op_name =~ uniform ]] || [[ $op_name =~ random ]] || [[ $op_name =~ normal ]] || [[ $op_name =~ log_normal ]] || [[ $op_name =~ bernoulli ]] || [[ $op_name =~ cauchy ]] ;then
128+
op_name=$op_name"_"
129+
times=$(grep -E "${op_name}" "${i}" | awk '{print $10}')
130+
elif [[ $op_name == unique ]] ; then
131+
op_name="unique2"
132+
times=$(grep -E "${op_name}" "${i}" | awk '{print $10}')
133+
elif [[ $op_name == max_pool3d ]] || [[ $op_name == max_pool2d ]] ; then
134+
op_name=$op_name"_with_indices"
135+
times=$(grep -E "${op_name} " "${i}" | awk '{print $10}')
136+
elif [[ $op_name == dropout ]] || [[ $op_name == layer_norm ]] ; then
137+
times=$(grep -w "${op_name}" "${i}" | awk '{print $10}')
138+
elif [[ $op_name == ctc_loss ]] ; then
139+
op_name="_"$op_name
140+
times=$(grep -w "${op_name}" "${i}" | awk '{print $10}')
141+
elif [[ $op_name == adaptive_avg_pool2d ]] ; then
142+
op_name="adaptive_avg_pool2d"
143+
times=$(grep -w "${op_name} " "${i}" | awk '{print $10}')
144+
elif [[ $op_name == softmax ]] ; then
145+
op_name="aten::softmax"
146+
times=$(grep -E "${op_name}" "${i}" | awk '{print $10}')
147+
elif [[ $op_name == group_norm ]] ; then
148+
op_name="aten::group_norm"
149+
times=$(grep -E "${op_name}" "${i}" | awk '{print $10}')
150+
else
151+
times=$(grep -E "${op_name} " "${i}" | awk '{print $10}')
152+
fi
153+
else
154+
if [[ $op_name =~ batch_norm ]] ; then
155+
op_name="batch_norm_backward"
156+
times=$(grep -E "${op_name}" "${i}" | awk '{print $10}')
157+
elif [[ $op_name == max_pool3d ]] || [[ $op_name == max_pool2d ]] ; then
158+
op_name=$op_name"_with_indices_backward"
159+
times=$(grep -E "${op_name} " "${i}" | awk '{print $10}')
160+
elif [[ $op_name == col2im ]] ; then
161+
op_name="Col2ImBackward0"
162+
times=$(grep -E "${op_name} " "${i}" | grep -v "autograd::engine" | awk '{print $10}')
163+
elif [[ $op_name == im2col ]] ; then
164+
op_name="Im2ColBackward0"
165+
times=$(grep -E "${op_name} " "${i}" | grep -v "autograd::engine" | awk '{print $10}')
166+
elif [[ $op_name == flip ]] ; then
167+
op_name="FlipBackward0"
168+
times=$(grep -E "${op_name} " "${i}" | grep -v "autograd::engine" | awk '{print $10}')
169+
elif [[ $op_name == matmul ]] ; then
170+
op_name="MmBackward0"
171+
times=$(grep -E "${op_name} " "${i}" | grep -v "autograd::engine" | awk '{print $10}')
172+
elif [[ $op_name == roll ]] ; then
173+
op_name="RollBackward0"
174+
times=$(grep -E "${op_name} " "${i}" | grep -v "autograd::engine" | awk '{print $10}')
175+
elif [[ $op_name == softmax ]] ; then
176+
op_name=$op_name"_backward_data"
177+
times=$(grep -E "${op_name} " "${i}" | awk '{print $10}')
178+
elif [[ $op_name == remainder ]] ; then
179+
op_name="RemainderBackward0"
180+
times=$(grep -E "${op_name} " "${i}" | awk '{print $10}')
181+
elif [[ $op_name == l1_loss ]] ; then
182+
op_name="l1_loss"
183+
else
184+
op_name=$op_name"_backward"
185+
times=$(grep -E "${op_name} " "${i}" | awk '{print $10}')
186+
fi
187+
fi
188+
189+
texts=$(grep -E "shape :|shape:" "$i")
190+
number=""
191+
if [[ $op_name == l1_loss ]] && [[ $Get_backward == "True" ]] ; then
192+
op_name="AbsBackward0"
193+
times=$(grep -E "${op_name} " "${i}" | grep -v "autograd" | awk '{print $10}' | head -n 6)
194+
texts=$(grep -E "shape :|shape:" "$i" | head -n 6)
195+
op_summary
196+
op_name="MeanBackward0"
197+
times=$(grep -E "${op_name} " "${i}" | grep -v "autograd" | awk '{print $10}')
198+
texts=$(grep -E "shape :|shape:" "$i" | tail -n 6)
199+
op_summary
200+
else
201+
op_summary
202+
fi
203+
done
+128
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
name: Linux OP Benchmark Test
2+
3+
on:
4+
workflow_call:
5+
inputs:
6+
pytorch:
7+
required: false
8+
type: string
9+
default: 'main'
10+
description: Pytorch branch/commit
11+
keep_torch_xpu_ops:
12+
required: false
13+
type: string
14+
default: 'false'
15+
description: Keep torch-xpu-ops pin. `true` means use pined commit
16+
triton:
17+
required: false
18+
type: string
19+
default: ''
20+
description: Triton commit. Use pytorch pined commit by default
21+
python:
22+
required: false
23+
type: string
24+
default: '3.10'
25+
description: Python version
26+
runner:
27+
required: true
28+
type: string
29+
default: 'linux.idc.xpu'
30+
description: Runner label
31+
driver:
32+
required: false
33+
type: string
34+
default: 'rolling'
35+
description: Driver lts/rolling
36+
37+
permissions: read-all
38+
39+
jobs:
40+
op_benchmark_test:
41+
runs-on: ${{ inputs.runner }}
42+
timeout-minutes: 900
43+
env:
44+
NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }}
45+
DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }}
46+
steps:
47+
- name: Checkout torch-xpu-ops
48+
uses: actions/checkout@v4
49+
- name: Prepare Stock Pytorch
50+
run: |
51+
pwd
52+
which conda && conda clean -ay
53+
conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \
54+
rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK}
55+
conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y
56+
source activate xpu_op_${ZE_AFFINITY_MASK}
57+
cd ../ && rm -rf pytorch
58+
pip install requests
59+
git clone https://github.com/pytorch/pytorch pytorch
60+
if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
61+
cd pytorch && git checkout $(echo ${{ inputs.pytorch }})
62+
# apply PRs for stock pytorch
63+
python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
64+
git status && git show -s
65+
git submodule sync && git submodule update --init --recursive
66+
if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then
67+
echo "Don't replace torch-xpu-ops!"
68+
else
69+
rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
70+
# Workaround for torch-xpu-ops ci test
71+
sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
72+
fi
73+
fi
74+
- name: Download Pytorch wheel
75+
if: ${{ inputs.pytorch != 'nightly_wheel' }}
76+
uses: actions/download-artifact@v4
77+
with:
78+
name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}
79+
path: ${{ github.workspace }}
80+
- name: Install Pytorch XPU
81+
run: |
82+
source activate xpu_op_${ZE_AFFINITY_MASK}
83+
source .github/scripts/env.sh ${{ inputs.pytorch }}
84+
if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
85+
cd ../pytorch
86+
export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
87+
pip install -r requirements.txt
88+
pip install --force-reinstall ${{ github.workspace }}/torch*.whl
89+
git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd ..
90+
else
91+
pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
92+
TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
93+
fi
94+
pip install -r .ci/docker/requirements-ci.txt
95+
- name: Torch Config
96+
run: |
97+
source activate xpu_op_${ZE_AFFINITY_MASK}
98+
source .github/scripts/env.sh ${{ inputs.pytorch }}
99+
python -c "import torch; print(torch.__config__.show())"
100+
python -c "import torch; print(torch.__config__.parallel_info())"
101+
python -c "import torch; print(torch.__config__.torch.xpu.device_count())"
102+
103+
cd ..
104+
python pytorch/torch/utils/collect_env.py
105+
rm -rf /tmp/torchinductor_*
106+
rm -rf ~/.triton/cache
107+
- name: Run Torch XPU Op Benchmark
108+
if: ${{ inputs.driver == 'rolling' }}
109+
run: |
110+
source .github/scripts/env.sh ${{ inputs.pytorch }}
111+
source activate xpu_op_${ZE_AFFINITY_MASK}
112+
mkdir -p ${{ github.workspace }}/op_benchmark
113+
cd test/microbench
114+
filename=$(find -- *.py)
115+
for i in $filename
116+
do
117+
python ${i%.*}.py > ${{ github.workspace }}/op_benchmark/${i%.*}.log
118+
done
119+
# Summary forward op time
120+
bash ${{ github.workspace }}/.github/scripts/microbench_summary.sh ${{ github.workspace }}/op_benchmark ${{ github.workspace }}/op_benchmark/forward_op_summary.csv
121+
# Summary backward op time
122+
bash ${{ github.workspace }}/.github/scripts/microbench_summary.sh ${{ github.workspace }}/op_benchmark ${{ github.workspace }}/op_benchmark/backward_op_summary.csv True
123+
- name: Upload Inductor XPU OP benchmark Log
124+
if: always()
125+
uses: actions/upload-artifact@v4
126+
with:
127+
name: Inductor-XPU-OP-Benchmark-Data-${{ github.event.pull_request.number || github.sha }}
128+
path: ${{ github.workspace }}/op_benchmark

.github/workflows/nightly_ondemand_rolling.yml

+12
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,18 @@ jobs:
9595
triton: ${{ github.event_name == 'schedule' && '' || inputs.triton }}
9696
driver: rolling
9797
runner: pvc_rolling
98+
99+
Linux-Nightly-Ondemand-OP-Microbench-Tests-Rolling:
100+
name: linux-nightly-ondemand-rolling / Op_microbench
101+
needs: Linux-Nightly-Ondemand-Build-Rolling
102+
uses: ./.github/workflows/_linux_op_benchmark.yml
103+
with:
104+
keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
105+
pytorch: ${{ needs.Linux-Nightly-Ondemand-Build-Rolling.outputs.torch_commit_id }}
106+
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
107+
triton: ${{ github.event_name == 'schedule' && '' || inputs.triton }}
108+
driver: rolling
109+
runner: pvc_rolling
98110

99111
Linux-Nightly-Ondemand-E2E-Tests-Rolling:
100112
runs-on: pvc_rolling

0 commit comments

Comments
 (0)