Skip to content

Commit 848d09e

Browse files
Ana Mihajlovic
authored and committed
update tests
1 parent 9688df2 commit 848d09e

File tree

2 files changed: +210 additions, -123 deletions
Lines changed: 109 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,68 +1,115 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2-
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck %s
3-
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck -check-prefix=GFX12W32 %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck -check-prefixes=GFX11W32 %s
44

5-
define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, i32 %a, i32 %b, i32 %c, i32 %d) {
6-
; CHECK-LABEL: divergent_i1_phi_if_else:
7-
; CHECK: ; %bb.0: ; %entry
8-
; CHECK-NEXT: v_cmp_le_u32_e64 s0, v3, v4
9-
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
10-
; CHECK-NEXT: s_mov_b32 s2, s0
11-
; CHECK-NEXT: s_and_saveexec_b32 s1, s0
12-
; CHECK-NEXT: ; %bb.1: ; %C
13-
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
14-
; CHECK-NEXT: s_and_not1_b32 s2, s0, exec_lo
15-
; CHECK-NEXT: s_and_b32 s3, vcc_lo, exec_lo
16-
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
17-
; CHECK-NEXT: s_or_b32 s2, s2, s3
18-
; CHECK-NEXT: ; %bb.2: ; %MergeCF
19-
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
20-
; CHECK-NEXT: s_nor_b32 s1, s0, s2
21-
; CHECK-NEXT: ; implicit-def: $sgpr0
22-
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
23-
; CHECK-NEXT: s_and_saveexec_b32 s2, s1
24-
; CHECK-NEXT: s_xor_b32 s1, exec_lo, s2
25-
; CHECK-NEXT: ; %bb.3: ; %B
26-
; CHECK-NEXT: v_cmp_gt_u32_e64 s0, 2, v2
27-
; CHECK-NEXT: ; implicit-def: $vgpr2
28-
; CHECK-NEXT: ; %bb.4: ; %Flow
29-
; CHECK-NEXT: s_and_not1_saveexec_b32 s1, s1
30-
; CHECK-NEXT: ; %bb.5: ; %A
31-
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
32-
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
33-
; CHECK-NEXT: s_and_not1_b32 s0, s0, exec_lo
34-
; CHECK-NEXT: s_and_b32 s2, vcc_lo, exec_lo
35-
; CHECK-NEXT: s_or_b32 s0, s0, s2
36-
; CHECK-NEXT: ; %bb.6: ; %exit
37-
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
38-
; CHECK-NEXT: v_cndmask_b32_e64 v2, 2, 1, s0
39-
; CHECK-NEXT: global_store_b32 v[0:1], v2, off
40-
; CHECK-NEXT: s_endpgm
41-
entry:
42-
%x = icmp ule i32 %a, %b
43-
br i1 %x, label %C, label %MergeCF
44-
45-
C:
46-
%y = icmp eq i32 %a, %c
47-
br label %MergeCF
48-
49-
MergeCF:
50-
%z = phi i1 [ %x, %entry ], [ %y, %C ]
51-
%w = icmp ule i32 %a, %b
52-
%cmp = or i1 %w, %z
53-
br i1 %cmp, label %A, label %B
54-
55-
A:
56-
%val_A = icmp uge i32 %tid, 1
57-
br label %exit
5+
; test_w32: both i32 arguments are converted to i1 with
; llvm.amdgcn.inverse.ballot.i32, OR'ed together, inverted with `xor ..., -1`,
; and the inverted value is returned through llvm.amdgcn.ballot.i32.
; Both check prefixes expect the or + not pair to show up as one s_nor_b32
; operating on the SGPRs written by the two v_readfirstlane_b32 instructions.
define amdgpu_ps i32 @test_w32(i32 %x, i32 %y) {
6+
; GFX12W32-LABEL: test_w32:
7+
; GFX12W32: ; %bb.0:
8+
; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
9+
; GFX12W32-NEXT: v_readfirstlane_b32 s1, v1
10+
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
11+
; GFX12W32-NEXT: s_nor_b32 s0, s0, s1
12+
; GFX12W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
13+
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
14+
; GFX12W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
15+
; GFX12W32-NEXT: s_wait_alu 0xf1ff
16+
; GFX12W32-NEXT: ; return to shader part epilog
17+
;
18+
; GFX11W32-LABEL: test_w32:
19+
; GFX11W32: ; %bb.0:
20+
; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
21+
; GFX11W32-NEXT: v_readfirstlane_b32 s1, v1
22+
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
23+
; GFX11W32-NEXT: s_nor_b32 s0, s0, s1
24+
; GFX11W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
25+
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
26+
; GFX11W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
27+
; GFX11W32-NEXT: ; return to shader part epilog
28+
%x.b = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %x)
29+
%y.b = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %y)
30+
%t = or i1 %x.b, %y.b
31+
; %t has exactly one user here (the inversion below).
%t.1 = xor i1 %t, -1
32+
%z = call i32 @llvm.amdgcn.ballot.i32(i1 %t.1)
33+
ret i32 %z
34+
}
5835

59-
B:
60-
%val_B = icmp ult i32 %tid, 2
61-
br label %exit
36+
; negative_test_w32: the `or` result %t feeds two instructions (%t.1 and
; %p.1), and both derived values are balloted and added together.
; Both check prefixes expect a separate s_or_b32 followed by
; s_xor_b32 ..., -1; no s_nor_b32 appears in the expected output.
define amdgpu_ps i32 @negative_test_w32(i32 %x, i32 %y) {
37+
; GFX12W32-LABEL: negative_test_w32:
38+
; GFX12W32: ; %bb.0:
39+
; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
40+
; GFX12W32-NEXT: v_readfirstlane_b32 s1, v1
41+
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
42+
; GFX12W32-NEXT: s_or_b32 s0, s0, s1
43+
; GFX12W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
44+
; GFX12W32-NEXT: s_xor_b32 s0, s0, -1
45+
; GFX12W32-NEXT: s_wait_alu 0xfffe
46+
; GFX12W32-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
47+
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
48+
; GFX12W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
49+
; GFX12W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
50+
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
51+
; GFX12W32-NEXT: s_add_co_i32 s0, s0, vcc_lo
52+
; GFX12W32-NEXT: s_wait_alu 0xfffe
53+
; GFX12W32-NEXT: ; return to shader part epilog
54+
;
55+
; GFX11W32-LABEL: negative_test_w32:
56+
; GFX11W32: ; %bb.0:
57+
; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
58+
; GFX11W32-NEXT: v_readfirstlane_b32 s1, v1
59+
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
60+
; GFX11W32-NEXT: s_or_b32 s0, s0, s1
61+
; GFX11W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
62+
; GFX11W32-NEXT: s_xor_b32 s0, s0, -1
63+
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
64+
; GFX11W32-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
65+
; GFX11W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
66+
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
67+
; GFX11W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
68+
; GFX11W32-NEXT: s_add_i32 s0, s0, vcc_lo
69+
; GFX11W32-NEXT: ; return to shader part epilog
70+
%x.b = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %x)
71+
%y.b = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %y)
72+
%t = or i1 %x.b, %y.b
73+
%t.1 = xor i1 %t, -1
74+
; Second user of %t. TODO(review): -4 lies outside the signed range of i1;
; confirm this literal is the intended constant for the extra use.
%p.1 = xor i1 %t, -4
75+
%z = call i32 @llvm.amdgcn.ballot.i32(i1 %t.1)
76+
%q = call i32 @llvm.amdgcn.ballot.i32(i1 %p.1)
77+
%r = add i32 %z, %q
78+
ret i32 %r
79+
}
6280

63-
exit:
64-
%phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
65-
%sel = select i1 %phi, i32 1, i32 2
66-
store i32 %sel, ptr addrspace(1) %out
81+
; test_vgpr_w32: element-wise `or` of two <4 x i32> arguments followed by an
; `xor` with an all-ones vector; the result is stored to %out.
; Both check prefixes expect four v_or_b32 and four v_not_b32 instructions
; (one pair per element) and a single global_store_b128.
define amdgpu_ps void @test_vgpr_w32(<4 x i32> %x, <4 x i32> %y, ptr addrspace(1) %out) {
82+
; GFX12W32-LABEL: test_vgpr_w32:
83+
; GFX12W32: ; %bb.0:
84+
; GFX12W32-NEXT: v_or_b32_e32 v3, v3, v7
85+
; GFX12W32-NEXT: v_or_b32_e32 v2, v2, v6
86+
; GFX12W32-NEXT: v_or_b32_e32 v1, v1, v5
87+
; GFX12W32-NEXT: v_or_b32_e32 v0, v0, v4
88+
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
89+
; GFX12W32-NEXT: v_not_b32_e32 v3, v3
90+
; GFX12W32-NEXT: v_not_b32_e32 v2, v2
91+
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
92+
; GFX12W32-NEXT: v_not_b32_e32 v1, v1
93+
; GFX12W32-NEXT: v_not_b32_e32 v0, v0
94+
; GFX12W32-NEXT: global_store_b128 v[8:9], v[0:3], off
95+
; GFX12W32-NEXT: s_endpgm
96+
;
97+
; GFX11W32-LABEL: test_vgpr_w32:
98+
; GFX11W32: ; %bb.0:
99+
; GFX11W32-NEXT: v_or_b32_e32 v3, v3, v7
100+
; GFX11W32-NEXT: v_or_b32_e32 v2, v2, v6
101+
; GFX11W32-NEXT: v_or_b32_e32 v1, v1, v5
102+
; GFX11W32-NEXT: v_or_b32_e32 v0, v0, v4
103+
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
104+
; GFX11W32-NEXT: v_not_b32_e32 v3, v3
105+
; GFX11W32-NEXT: v_not_b32_e32 v2, v2
106+
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
107+
; GFX11W32-NEXT: v_not_b32_e32 v1, v1
108+
; GFX11W32-NEXT: v_not_b32_e32 v0, v0
109+
; GFX11W32-NEXT: global_store_b128 v[8:9], v[0:3], off
110+
; GFX11W32-NEXT: s_endpgm
111+
%p = or <4 x i32> %x, %y
112+
; Bitwise NOT of every element of %p.
%q = xor <4 x i32> %p, <i32 -1, i32 -1, i32 -1, i32 -1>
113+
store <4 x i32> %q, ptr addrspace(1) %out
67114
ret void
68115
}
Lines changed: 101 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1,68 +1,108 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2-
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck %s
3-
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck -check-prefix=GFX12W64 %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck -check-prefix=GFX11W64 %s
44

5-
define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, i32 %a, i32 %b, i32 %c, i32 %d) {
6-
; CHECK-LABEL: divergent_i1_phi_if_else:
7-
; CHECK: ; %bb.0: ; %entry
8-
; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], v3, v4
9-
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
10-
; CHECK-NEXT: s_mov_b64 s[4:5], s[0:1]
11-
; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
12-
; CHECK-NEXT: ; %bb.1: ; %C
13-
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
14-
; CHECK-NEXT: s_and_not1_b64 s[4:5], s[0:1], exec
15-
; CHECK-NEXT: s_and_b64 s[6:7], vcc, exec
16-
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
17-
; CHECK-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
18-
; CHECK-NEXT: ; %bb.2: ; %MergeCF
19-
; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
20-
; CHECK-NEXT: s_nor_b64 s[2:3], s[0:1], s[4:5]
21-
; CHECK-NEXT: ; implicit-def: $sgpr0_sgpr1
22-
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
23-
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
24-
; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[4:5]
25-
; CHECK-NEXT: ; %bb.3: ; %B
26-
; CHECK-NEXT: v_cmp_gt_u32_e64 s[0:1], 2, v2
27-
; CHECK-NEXT: ; implicit-def: $vgpr2
28-
; CHECK-NEXT: ; %bb.4: ; %Flow
29-
; CHECK-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3]
30-
; CHECK-NEXT: ; %bb.5: ; %A
31-
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
32-
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
33-
; CHECK-NEXT: s_and_not1_b64 s[0:1], s[0:1], exec
34-
; CHECK-NEXT: s_and_b64 s[4:5], vcc, exec
35-
; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
36-
; CHECK-NEXT: ; %bb.6: ; %exit
37-
; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
38-
; CHECK-NEXT: v_cndmask_b32_e64 v2, 2, 1, s[0:1]
39-
; CHECK-NEXT: global_store_b32 v[0:1], v2, off
40-
; CHECK-NEXT: s_endpgm
41-
entry:
42-
%x = icmp ule i32 %a, %b
43-
br i1 %x, label %C, label %MergeCF
445

45-
C:
46-
%y = icmp eq i32 %a, %c
47-
br label %MergeCF
48-
49-
MergeCF:
50-
%z = phi i1 [ %x, %entry ], [ %y, %C ]
51-
%w = icmp ule i32 %a, %b
52-
%cmp = or i1 %w, %z
53-
br i1 %cmp, label %A, label %B
54-
55-
A:
56-
%val_A = icmp uge i32 %tid, 1
57-
br label %exit
6+
; test_w64: both `i64 inreg` arguments are converted to i1 with
; llvm.amdgcn.inverse.ballot.i64, OR'ed together, inverted with `xor ..., -1`,
; and the inverted value is returned through llvm.amdgcn.ballot.i64.
; Both check prefixes expect the or + not pair to show up as one s_nor_b64
; on the register pairs s[0:1] and s[2:3].
define amdgpu_ps i64 @test_w64(i64 inreg %x, i64 inreg %y) {
7+
; GFX12W64-LABEL: test_w64:
8+
; GFX12W64: ; %bb.0:
9+
; GFX12W64-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
10+
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11+
; GFX12W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
12+
; GFX12W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
13+
; GFX12W64-NEXT: s_wait_alu 0xf1ff
14+
; GFX12W64-NEXT: ; return to shader part epilog
15+
;
16+
; GFX11W64-LABEL: test_w64:
17+
; GFX11W64: ; %bb.0:
18+
; GFX11W64-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
19+
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
20+
; GFX11W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
21+
; GFX11W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
22+
; GFX11W64-NEXT: ; return to shader part epilog
23+
%x.b = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %x)
24+
%y.b = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %y)
25+
%t = or i1 %x.b, %y.b
26+
; %t has exactly one user here (the inversion below).
%t.1 = xor i1 %t, -1
27+
%z = call i64 @llvm.amdgcn.ballot.i64(i1 %t.1)
28+
ret i64 %z
29+
}
5830

59-
B:
60-
%val_B = icmp ult i32 %tid, 2
61-
br label %exit
31+
; negative_test_w64: the `or` result %t feeds two instructions (%t.1 and
; %p.1), and both derived values are balloted and added together.
; Both check prefixes expect a separate s_or_b64 followed by
; s_xor_b64 ..., -1; no s_nor_b64 appears in the expected output.
; The final 64-bit add is expected as s_add_nc_u64 under the GFX12W64 prefix
; and as an s_add_u32 / s_addc_u32 pair under the GFX11W64 prefix.
; The %out parameter is declared but never referenced in the body.
define amdgpu_ps i64 @negative_test_w64(i64 inreg %x, i64 inreg %y, ptr addrspace(1) %out) {
32+
; GFX12W64-LABEL: negative_test_w64:
33+
; GFX12W64: ; %bb.0:
34+
; GFX12W64-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
35+
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
36+
; GFX12W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
37+
; GFX12W64-NEXT: s_xor_b64 s[0:1], s[0:1], -1
38+
; GFX12W64-NEXT: s_wait_alu 0xfffe
39+
; GFX12W64-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
40+
; GFX12W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
41+
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
42+
; GFX12W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
43+
; GFX12W64-NEXT: s_add_nc_u64 s[0:1], s[0:1], vcc
44+
; GFX12W64-NEXT: s_wait_alu 0xfffe
45+
; GFX12W64-NEXT: ; return to shader part epilog
46+
;
47+
; GFX11W64-LABEL: negative_test_w64:
48+
; GFX11W64: ; %bb.0:
49+
; GFX11W64-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
50+
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
51+
; GFX11W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
52+
; GFX11W64-NEXT: s_xor_b64 s[0:1], s[0:1], -1
53+
; GFX11W64-NEXT: s_waitcnt_depctr 0xfffe
54+
; GFX11W64-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
55+
; GFX11W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
56+
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
57+
; GFX11W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
58+
; GFX11W64-NEXT: s_add_u32 s0, vcc_lo, s0
59+
; GFX11W64-NEXT: s_waitcnt_depctr 0xfffe
60+
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
61+
; GFX11W64-NEXT: s_addc_u32 s1, vcc_hi, s1
62+
; GFX11W64-NEXT: ; return to shader part epilog
63+
%x.b = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %x)
64+
%y.b = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %y)
65+
%t = or i1 %x.b, %y.b
66+
%t.1 = xor i1 %t, -1
67+
; Second user of %t. TODO(review): -4 lies outside the signed range of i1;
; confirm this literal is the intended constant for the extra use.
%p.1 = xor i1 %t, -4
68+
%z = call i64 @llvm.amdgcn.ballot.i64(i1 %t.1)
69+
%q = call i64 @llvm.amdgcn.ballot.i64(i1 %p.1)
70+
%r = add i64 %z, %q
71+
ret i64 %r
72+
}
6273

63-
exit:
64-
%phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
65-
%sel = select i1 %phi, i32 1, i32 2
66-
store i32 %sel, ptr addrspace(1) %out
74+
; test_vgpr_w64: element-wise `or` of two <4 x i32> arguments followed by an
; `xor` with an all-ones vector; the result is stored to %out.
; Both check prefixes expect four v_or_b32 and four v_not_b32 instructions
; (one pair per element) and a single global_store_b128.
define amdgpu_ps void @test_vgpr_w64(<4 x i32> %x, <4 x i32> %y, ptr addrspace(1) %out) {
75+
; GFX12W64-LABEL: test_vgpr_w64:
76+
; GFX12W64: ; %bb.0:
77+
; GFX12W64-NEXT: v_or_b32_e32 v3, v3, v7
78+
; GFX12W64-NEXT: v_or_b32_e32 v2, v2, v6
79+
; GFX12W64-NEXT: v_or_b32_e32 v1, v1, v5
80+
; GFX12W64-NEXT: v_or_b32_e32 v0, v0, v4
81+
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
82+
; GFX12W64-NEXT: v_not_b32_e32 v3, v3
83+
; GFX12W64-NEXT: v_not_b32_e32 v2, v2
84+
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
85+
; GFX12W64-NEXT: v_not_b32_e32 v1, v1
86+
; GFX12W64-NEXT: v_not_b32_e32 v0, v0
87+
; GFX12W64-NEXT: global_store_b128 v[8:9], v[0:3], off
88+
; GFX12W64-NEXT: s_endpgm
89+
;
90+
; GFX11W64-LABEL: test_vgpr_w64:
91+
; GFX11W64: ; %bb.0:
92+
; GFX11W64-NEXT: v_or_b32_e32 v3, v3, v7
93+
; GFX11W64-NEXT: v_or_b32_e32 v2, v2, v6
94+
; GFX11W64-NEXT: v_or_b32_e32 v1, v1, v5
95+
; GFX11W64-NEXT: v_or_b32_e32 v0, v0, v4
96+
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
97+
; GFX11W64-NEXT: v_not_b32_e32 v3, v3
98+
; GFX11W64-NEXT: v_not_b32_e32 v2, v2
99+
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
100+
; GFX11W64-NEXT: v_not_b32_e32 v1, v1
101+
; GFX11W64-NEXT: v_not_b32_e32 v0, v0
102+
; GFX11W64-NEXT: global_store_b128 v[8:9], v[0:3], off
103+
; GFX11W64-NEXT: s_endpgm
104+
%p = or <4 x i32> %x, %y
105+
; Bitwise NOT of every element of %p.
%q = xor <4 x i32> %p, <i32 -1, i32 -1, i32 -1, i32 -1>
106+
store <4 x i32> %q, ptr addrspace(1) %out
67107
ret void
68108
}

0 commit comments

Comments
 (0)