Skip to content

Commit e3350a6

Browse files
authored
[AMDGPU] InstCombine llvm.amdgcn.ds.bpermute with uniform arguments (#130133)
Reland #129895 with a fix to avoid trying to combine bpermute of bitcast.
1 parent cdb9c61 commit e3350a6

File tree

3 files changed

+74
-4
lines changed

3 files changed

+74
-4
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

+23-4
Original file line numberDiff line numberDiff line change
@@ -1161,9 +1161,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
11611161
}
11621162
case Intrinsic::amdgcn_permlane64:
11631163
case Intrinsic::amdgcn_readfirstlane:
1164-
case Intrinsic::amdgcn_readlane: {
1165-
// If the first argument is uniform these intrinsics return it unchanged.
1166-
const Use &Src = II.getArgOperandUse(0);
1164+
case Intrinsic::amdgcn_readlane:
1165+
case Intrinsic::amdgcn_ds_bpermute: {
1166+
// If the data argument is uniform these intrinsics return it unchanged.
1167+
unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
1168+
const Use &Src = II.getArgOperandUse(SrcIdx);
11671169
if (isTriviallyUniform(Src))
11681170
return IC.replaceInstUsesWith(II, Src.get());
11691171

@@ -1172,7 +1174,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
11721174
return &II;
11731175

11741176
// readfirstlane.ty0 (bitcast ty1 x to ty0) -> bitcast (readfirstlane.ty1)
1175-
if (auto *BC = dyn_cast<BitCastInst>(Src); BC && BC->hasOneUse()) {
1177+
if (auto *BC = dyn_cast<BitCastInst>(Src);
1178+
BC && BC->hasOneUse() && IID != Intrinsic::amdgcn_ds_bpermute) {
11761179
Value *BCSrc = BC->getOperand(0);
11771180

11781181
// TODO: Handle this for update_dpp, mov_dpp8, and all permlane variants.
@@ -1195,6 +1198,22 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
11951198
}
11961199
}
11971200

1201+
// If the lane argument of bpermute is uniform, change it to readlane. This
1202+
// generates better code and can enable further optimizations because
1203+
// readlane is AlwaysUniform.
1204+
if (IID == Intrinsic::amdgcn_ds_bpermute) {
1205+
const Use &Lane = II.getArgOperandUse(0);
1206+
if (isTriviallyUniform(Lane)) {
1207+
Value *NewLane = IC.Builder.CreateLShr(Lane, 2);
1208+
Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1209+
II.getModule(), Intrinsic::amdgcn_readlane, II.getType());
1210+
II.setCalledFunction(NewDecl);
1211+
II.setOperand(0, Src);
1212+
II.setOperand(1, NewLane);
1213+
return &II;
1214+
}
1215+
}
1216+
11981217
return std::nullopt;
11991218
}
12001219
case Intrinsic::amdgcn_writelane: {

llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll

+39
Original file line numberDiff line numberDiff line change
@@ -6775,3 +6775,42 @@ define i32 @prng_poison_i32() {
67756775
%prng = call i32 @llvm.amdgcn.prng.b32(i32 poison)
67766776
ret i32 %prng
67776777
}
6778+
6779+
; --------------------------------------------------------------------
6780+
; llvm.amdgcn.ds.bpermute
6781+
; --------------------------------------------------------------------
6782+
6783+
define amdgpu_kernel void @ds_bpermute_uniform_src(ptr addrspace(1) %out, i32 %lane) {
6784+
; CHECK-LABEL: @ds_bpermute_uniform_src(
6785+
; CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT:%.*]], align 4
6786+
; CHECK-NEXT: ret void
6787+
;
6788+
%v = call i32 @llvm.amdgcn.ds.bpermute(i32 %lane, i32 7)
6789+
store i32 %v, ptr addrspace(1) %out
6790+
ret void
6791+
}
6792+
6793+
define amdgpu_kernel void @ds_bpermute_constant_lane(ptr addrspace(1) %out, i32 %src) {
6794+
; CHECK-LABEL: @ds_bpermute_constant_lane(
6795+
; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC:%.*]], i32 7)
6796+
; CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT:%.*]], align 4
6797+
; CHECK-NEXT: ret void
6798+
;
6799+
%v = call i32 @llvm.amdgcn.ds.bpermute(i32 28, i32 %src)
6800+
store i32 %v, ptr addrspace(1) %out
6801+
ret void
6802+
}
6803+
6804+
define amdgpu_kernel void @ds_bpermute_uniform_lane(ptr addrspace(1) %out, i32 %lanearg, i32 %src) {
6805+
; CHECK-LABEL: @ds_bpermute_uniform_lane(
6806+
; CHECK-NEXT: [[LANE:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[LANEARG:%.*]])
6807+
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[LANE]], 2
6808+
; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC:%.*]], i32 [[TMP1]])
6809+
; CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT:%.*]], align 4
6810+
; CHECK-NEXT: ret void
6811+
;
6812+
%lane = call i32 @llvm.amdgcn.readfirstlane(i32 %lanearg)
6813+
%v = call i32 @llvm.amdgcn.ds.bpermute(i32 %lane, i32 %src)
6814+
store i32 %v, ptr addrspace(1) %out
6815+
ret void
6816+
}

llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll

+12
Original file line numberDiff line numberDiff line change
@@ -311,3 +311,15 @@ define i32 @test_bitcast_f32_to_i32_readlane_convergencetoken(float %val, i32 in
311311
%result = call i32 @llvm.amdgcn.readlane.i32(i32 %bitcast, i32 %lane.index) [ "convergencectrl"(token %t) ]
312312
ret i32 %result
313313
}
314+
315+
define i32 @test_bitcast_f32_to_i32_ds_bpermute(float %val, i32 %addr) {
316+
; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_ds_bpermute(
317+
; CHECK-SAME: float [[VAL:%.*]], i32 [[ADDR:%.*]]) #[[ATTR0]] {
318+
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
319+
; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[ADDR]], i32 [[BITCAST]])
320+
; CHECK-NEXT: ret i32 [[RESULT]]
321+
;
322+
%bitcast = bitcast float %val to i32
323+
%result = call i32 @llvm.amdgcn.ds.bpermute(i32 %addr, i32 %bitcast)
324+
ret i32 %result
325+
}

0 commit comments

Comments
 (0)