Update isel-nor tests: replace the control-flow test with direct S_NOR selection tests

Ana Mihajlovic · Ana Mihajlovic · commit 848d09eb8697 · 2025-03-25T09:48:52.000+01:00
diff --git a/llvm/test/CodeGen/AMDGPU/isel-nor-32.ll b/llvm/test/CodeGen/AMDGPU/isel-nor-32.ll
@@ -1,68 +1,115 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck -check-prefix=GFX12W32 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck -check-prefix=GFX11W32 %s
 
-define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32  %tid, i32  %a, i32  %b, i32  %c, i32  %d) {
-; CHECK-LABEL: divergent_i1_phi_if_else:
-; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    v_cmp_le_u32_e64 s0, v3, v4
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT:    s_mov_b32 s2, s0
-; CHECK-NEXT:    s_and_saveexec_b32 s1, s0
-; CHECK-NEXT:  ; %bb.1: ; %C
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; CHECK-NEXT:    s_and_not1_b32 s2, s0, exec_lo
-; CHECK-NEXT:    s_and_b32 s3, vcc_lo, exec_lo
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_or_b32 s2, s2, s3
-; CHECK-NEXT:  ; %bb.2: ; %MergeCF
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; CHECK-NEXT:    s_nor_b32 s1, s0, s2
-; CHECK-NEXT:    ; implicit-def: $sgpr0
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_saveexec_b32 s2, s1
-; CHECK-NEXT:    s_xor_b32 s1, exec_lo, s2
-; CHECK-NEXT:  ; %bb.3: ; %B
-; CHECK-NEXT:    v_cmp_gt_u32_e64 s0, 2, v2
-; CHECK-NEXT:    ; implicit-def: $vgpr2
-; CHECK-NEXT:  ; %bb.4: ; %Flow
-; CHECK-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; CHECK-NEXT:  ; %bb.5: ; %A
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 s0, s0, exec_lo
-; CHECK-NEXT:    s_and_b32 s2, vcc_lo, exec_lo
-; CHECK-NEXT:    s_or_b32 s0, s0, s2
-; CHECK-NEXT:  ; %bb.6: ; %exit
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 2, 1, s0
-; CHECK-NEXT:    global_store_b32 v[0:1], v2, off
-; CHECK-NEXT:    s_endpgm
-entry:
-  %x = icmp ule i32 %a, %b
-  br i1 %x, label %C, label %MergeCF
-
-C:
-  %y = icmp eq i32 %a, %c
-  br label %MergeCF
-
-MergeCF:
-  %z = phi i1 [ %x, %entry ], [ %y, %C ]
-  %w = icmp ule i32 %a, %b
-  %cmp = or i1 %w, %z
-  br i1 %cmp, label %A, label %B
-
-A:
-  %val_A = icmp uge i32 %tid, 1
-  br label %exit
+define amdgpu_ps i32 @test_w32(i32 %x, i32 %y) { ; or + xor -1 on i1 values from inverse.ballot; the checks expect one s_nor_b32
+; GFX12W32-LABEL: test_w32:
+; GFX12W32:       ; %bb.0:
+; GFX12W32-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12W32-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT:    s_nor_b32 s0, s0, s1
+; GFX12W32-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12W32-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
+; GFX12W32-NEXT:    s_wait_alu 0xf1ff
+; GFX12W32-NEXT:    ; return to shader part epilog
+;
+; GFX11W32-LABEL: test_w32:
+; GFX11W32:       ; %bb.0:
+; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11W32-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT:    s_nor_b32 s0, s0, s1
+; GFX11W32-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11W32-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
+; GFX11W32-NEXT:    ; return to shader part epilog
+  %x.b = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %x) ; i32 argument -> i1
+  %y.b = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %y) ; i32 argument -> i1
+  %t = or i1 %x.b, %y.b ; single user: only the NOT below reads %t
+  %t.1 = xor i1 %t, -1 ; xor with all-ones (-1 as i1) is a logical NOT of %t
+  %z = call i32 @llvm.amdgcn.ballot.i32(i1 %t.1) ; i1 -> i32, returned to the caller
+  ret i32 %z
+}
 
-B:
-  %val_B = icmp ult i32 %tid, 2
-  br label %exit
+define amdgpu_ps i32 @negative_test_w32(i32 %x, i32 %y) { ; %t has two users here; the checks expect s_or_b32 + s_xor_b32 and no s_nor_b32
+; GFX12W32-LABEL: negative_test_w32:
+; GFX12W32:       ; %bb.0:
+; GFX12W32-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12W32-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT:    s_or_b32 s0, s0, s1
+; GFX12W32-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12W32-NEXT:    s_xor_b32 s0, s0, -1
+; GFX12W32-NEXT:    s_wait_alu 0xfffe
+; GFX12W32-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX12W32-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
+; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12W32-NEXT:    s_add_co_i32 s0, s0, vcc_lo
+; GFX12W32-NEXT:    s_wait_alu 0xfffe
+; GFX12W32-NEXT:    ; return to shader part epilog
+;
+; GFX11W32-LABEL: negative_test_w32:
+; GFX11W32:       ; %bb.0:
+; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11W32-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT:    s_or_b32 s0, s0, s1
+; GFX11W32-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11W32-NEXT:    s_xor_b32 s0, s0, -1
+; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11W32-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W32-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
+; GFX11W32-NEXT:    s_add_i32 s0, s0, vcc_lo
+; GFX11W32-NEXT:    ; return to shader part epilog
+  %x.b = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %x) ; i32 argument -> i1
+  %y.b = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %y) ; i32 argument -> i1
+  %t = or i1 %x.b, %y.b ; read by both %t.1 and %p.1
+  %t.1 = xor i1 %t, -1 ; logical NOT of %t
+  %p.1 = xor i1 %t, false ; extra user of %t; xor with false keeps the value, matching the checks where this ballot reads the s_or result
+  %z = call i32 @llvm.amdgcn.ballot.i32(i1 %t.1) ; ballot of the negated value
+  %q = call i32 @llvm.amdgcn.ballot.i32(i1 %p.1) ; ballot of the un-negated value
+  %r = add i32 %z, %q ; combine both ballots so neither is dead
+  ret i32 %r
+}
 
-exit:
-  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
-  %sel = select i1 %phi, i32 1, i32 2
-  store i32 %sel, ptr addrspace(1) %out
+define amdgpu_ps void @test_vgpr_w32(<4 x i32> %x, <4 x i32> %y, ptr addrspace(1) %out) { ; vector or + xor -1 on non-inreg arguments; the checks expect v_or_b32 + v_not_b32 per element
+; GFX12W32-LABEL: test_vgpr_w32:
+; GFX12W32:       ; %bb.0:
+; GFX12W32-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX12W32-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX12W32-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX12W32-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12W32-NEXT:    v_not_b32_e32 v3, v3
+; GFX12W32-NEXT:    v_not_b32_e32 v2, v2
+; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12W32-NEXT:    v_not_b32_e32 v1, v1
+; GFX12W32-NEXT:    v_not_b32_e32 v0, v0
+; GFX12W32-NEXT:    global_store_b128 v[8:9], v[0:3], off
+; GFX12W32-NEXT:    s_endpgm
+;
+; GFX11W32-LABEL: test_vgpr_w32:
+; GFX11W32:       ; %bb.0:
+; GFX11W32-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX11W32-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX11W32-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX11W32-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11W32-NEXT:    v_not_b32_e32 v3, v3
+; GFX11W32-NEXT:    v_not_b32_e32 v2, v2
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11W32-NEXT:    v_not_b32_e32 v1, v1
+; GFX11W32-NEXT:    v_not_b32_e32 v0, v0
+; GFX11W32-NEXT:    global_store_b128 v[8:9], v[0:3], off
+; GFX11W32-NEXT:    s_endpgm
+  %p = or <4 x i32> %x,  %y ; element-wise OR of the two vector arguments
+  %q = xor <4 x i32> %p, <i32 -1, i32 -1, i32 -1, i32 -1> ; xor with all-ones inverts every bit of %p
+  store <4 x i32> %q, ptr addrspace(1) %out ; write the result through %out
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/isel-nor-64.ll b/llvm/test/CodeGen/AMDGPU/isel-nor-64.ll
@@ -1,68 +1,108 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck -check-prefix=GFX12W64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck -check-prefix=GFX11W64 %s
 
-define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32  %tid, i32  %a, i32  %b, i32  %c, i32  %d) {
-; CHECK-LABEL: divergent_i1_phi_if_else:
-; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    v_cmp_le_u32_e64 s[0:1], v3, v4
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; CHECK-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
-; CHECK-NEXT:  ; %bb.1: ; %C
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; CHECK-NEXT:    s_and_not1_b64 s[4:5], s[0:1], exec
-; CHECK-NEXT:    s_and_b64 s[6:7], vcc, exec
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; CHECK-NEXT:  ; %bb.2: ; %MergeCF
-; CHECK-NEXT:    s_or_b64 exec, exec, s[2:3]
-; CHECK-NEXT:    s_nor_b64 s[2:3], s[0:1], s[4:5]
-; CHECK-NEXT:    ; implicit-def: $sgpr0_sgpr1
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
-; CHECK-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
-; CHECK-NEXT:  ; %bb.3: ; %B
-; CHECK-NEXT:    v_cmp_gt_u32_e64 s[0:1], 2, v2
-; CHECK-NEXT:    ; implicit-def: $vgpr2
-; CHECK-NEXT:  ; %bb.4: ; %Flow
-; CHECK-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[2:3]
-; CHECK-NEXT:  ; %bb.5: ; %A
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b64 s[0:1], s[0:1], exec
-; CHECK-NEXT:    s_and_b64 s[4:5], vcc, exec
-; CHECK-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
-; CHECK-NEXT:  ; %bb.6: ; %exit
-; CHECK-NEXT:    s_or_b64 exec, exec, s[2:3]
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 2, 1, s[0:1]
-; CHECK-NEXT:    global_store_b32 v[0:1], v2, off
-; CHECK-NEXT:    s_endpgm
-entry:
-  %x = icmp ule i32 %a, %b
-  br i1 %x, label %C, label %MergeCF
 
-C:
-  %y = icmp eq i32 %a, %c
-  br label %MergeCF
-
-MergeCF:
-  %z = phi i1 [ %x, %entry ], [ %y, %C ]
-  %w = icmp ule i32 %a, %b
-  %cmp = or i1 %w, %z
-  br i1 %cmp, label %A, label %B
-
-A:
-  %val_A = icmp uge i32 %tid, 1
-  br label %exit
+define amdgpu_ps i64 @test_w64(i64 inreg %x, i64 inreg %y) { ; or + xor -1 on i1 values from inverse.ballot; the checks expect one s_nor_b64
+; GFX12W64-LABEL: test_w64:
+; GFX12W64:       ; %bb.0:
+; GFX12W64-NEXT:    s_nor_b64 s[0:1], s[0:1], s[2:3]
+; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12W64-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX12W64-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v0
+; GFX12W64-NEXT:    s_wait_alu 0xf1ff
+; GFX12W64-NEXT:    ; return to shader part epilog
+;
+; GFX11W64-LABEL: test_w64:
+; GFX11W64:       ; %bb.0:
+; GFX11W64-NEXT:    s_nor_b64 s[0:1], s[0:1], s[2:3]
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W64-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX11W64-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v0
+; GFX11W64-NEXT:    ; return to shader part epilog
+  %x.b = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %x) ; i64 argument -> i1
+  %y.b = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %y) ; i64 argument -> i1
+  %t = or i1 %x.b, %y.b ; single user: only the NOT below reads %t
+  %t.1 = xor i1 %t, -1 ; xor with all-ones (-1 as i1) is a logical NOT of %t
+  %z = call i64 @llvm.amdgcn.ballot.i64(i1 %t.1) ; i1 -> i64, returned to the caller
+  ret i64 %z
+}
 
-B:
-  %val_B = icmp ult i32 %tid, 2
-  br label %exit
+define amdgpu_ps i64 @negative_test_w64(i64 inreg %x, i64 inreg %y, ptr addrspace(1) %out) { ; %t has two users; the checks expect s_or_b64 + s_xor_b64, no s_nor_b64. NOTE(review): %out is never referenced in the body
+; GFX12W64-LABEL: negative_test_w64:
+; GFX12W64:       ; %bb.0:
+; GFX12W64-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12W64-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX12W64-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; GFX12W64-NEXT:    s_wait_alu 0xfffe
+; GFX12W64-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX12W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12W64-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
+; GFX12W64-NEXT:    s_add_nc_u64 s[0:1], s[0:1], vcc
+; GFX12W64-NEXT:    s_wait_alu 0xfffe
+; GFX12W64-NEXT:    ; return to shader part epilog
+;
+; GFX11W64-LABEL: negative_test_w64:
+; GFX11W64:       ; %bb.0:
+; GFX11W64-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11W64-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX11W64-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; GFX11W64-NEXT:    s_waitcnt_depctr 0xfffe
+; GFX11W64-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX11W64-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX11W64-NEXT:    s_add_u32 s0, vcc_lo, s0
+; GFX11W64-NEXT:    s_waitcnt_depctr 0xfffe
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT:    s_addc_u32 s1, vcc_hi, s1
+; GFX11W64-NEXT:    ; return to shader part epilog
+  %x.b = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %x) ; i64 argument -> i1
+  %y.b = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %y) ; i64 argument -> i1
+  %t = or i1 %x.b, %y.b ; read by both %t.1 and %p.1
+  %t.1 = xor i1 %t, -1 ; logical NOT of %t
+  %p.1 = xor i1 %t, false ; extra user of %t; xor with false keeps the value, matching the checks where this ballot reads the s_or result
+  %z = call i64 @llvm.amdgcn.ballot.i64(i1 %t.1) ; ballot of the negated value
+  %q = call i64 @llvm.amdgcn.ballot.i64(i1 %p.1) ; ballot of the un-negated value
+  %r = add i64 %z, %q ; combine both ballots so neither is dead
+  ret i64 %r
+}
 
-exit:
-  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
-  %sel = select i1 %phi, i32 1, i32 2
-  store i32 %sel, ptr addrspace(1) %out
+define amdgpu_ps void @test_vgpr_w64(<4 x i32> %x, <4 x i32> %y, ptr addrspace(1) %out) { ; vector or + xor -1 on non-inreg arguments; the checks expect v_or_b32 + v_not_b32 per element
+; GFX12W64-LABEL: test_vgpr_w64:
+; GFX12W64:       ; %bb.0:
+; GFX12W64-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX12W64-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX12W64-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX12W64-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12W64-NEXT:    v_not_b32_e32 v3, v3
+; GFX12W64-NEXT:    v_not_b32_e32 v2, v2
+; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12W64-NEXT:    v_not_b32_e32 v1, v1
+; GFX12W64-NEXT:    v_not_b32_e32 v0, v0
+; GFX12W64-NEXT:    global_store_b128 v[8:9], v[0:3], off
+; GFX12W64-NEXT:    s_endpgm
+;
+; GFX11W64-LABEL: test_vgpr_w64:
+; GFX11W64:       ; %bb.0:
+; GFX11W64-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX11W64-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX11W64-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX11W64-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11W64-NEXT:    v_not_b32_e32 v3, v3
+; GFX11W64-NEXT:    v_not_b32_e32 v2, v2
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11W64-NEXT:    v_not_b32_e32 v1, v1
+; GFX11W64-NEXT:    v_not_b32_e32 v0, v0
+; GFX11W64-NEXT:    global_store_b128 v[8:9], v[0:3], off
+; GFX11W64-NEXT:    s_endpgm
+  %p = or <4 x i32> %x,  %y ; element-wise OR of the two vector arguments
+  %q = xor <4 x i32> %p, <i32 -1, i32 -1, i32 -1, i32 -1> ; xor with all-ones inverts every bit of %p
+  store <4 x i32> %q, ptr addrspace(1) %out ; write the result through %out
   ret void
 }