[llvm] [AMDGPU] Allocate i1 argument to SGPRs (PR #72461)

Jun Wang via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 6 14:51:45 PDT 2024


https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/72461

>From 75f1a46f910dd86edb465d6f3f6b4cf494baebaf Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Wed, 15 Nov 2023 19:48:41 -0600
Subject: [PATCH 01/25] [AMDGPU] Allocate i1 argument to SGPRs

Currently i1 arguments are passed as 32-bit VGPRs. It would make more
sense to make use of SGPRs and pass these values as a wavesize bool mask.
---
 llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td |  5 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   | 13 +++++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp      | 23 +++++++++
 llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp  |  6 +++
 llvm/test/CodeGen/AMDGPU/z_callee.ll        | 33 ++++++++++++
 llvm/test/CodeGen/AMDGPU/z_caller.ll        | 43 ++++++++++++++++
 llvm/test/CodeGen/AMDGPU/z_caller2.ll       | 57 +++++++++++++++++++++
 7 files changed, 179 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/z_callee.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/z_caller.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/z_caller2.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 4be64629ddac8..faf82d412eb0c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -187,9 +187,12 @@ def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
 // Calling convention for leaf functions
 def CC_AMDGPU_Func : CallingConv<[
   CCIfByVal<CCPassByVal<4, 4>>,
-  CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
 
+  CCIfType<[i1] , CCAssignToReg<
+    !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
+  >>,
+
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
     !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
   >>>,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0a3a56e9b3a0b..88e387e1df609 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3668,6 +3668,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
   }
 
+  // In code below (after call of AnalyzeCallOperands),
+  // if (!Subtarget->enableFlatScratch()), it would use either s[48:51] or
+  // s[0:3]. Therefore, before calling AnalyzeCallOperands, we may need to
+  // reserve these registers.
+  if (!Subtarget->enableFlatScratch()) {
+    if (IsChainCallConv)
+      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
+          AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50, AMDGPU::SGPR51}, 4);
+    else
+      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
+          AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3}, 4);
+  }
+
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
 
   // Get a count of how many bytes are to be pushed on the stack.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 08351c49b2231..c0c093d3f4975 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -861,6 +861,16 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
+      // When calling convention allocates SGPR for i1 argument, we may
+      // have a SRPR_64 to SReg_32 copy for an outgoing i1 argument. Adjust
+      // the copy to avoid illegal copy.
+      if (AMDGPU::SGPR_64RegClass.contains(SrcReg)) {
+        auto sub0 = RI.getSubReg(SrcReg, AMDGPU::sub0);
+        if (sub0 != DestReg)
+          BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg).addReg(sub0);
+        return;
+      }
+
       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
       return;
     }
@@ -894,6 +904,19 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+      // When an i1 argument is allocated to an SGPR_32, we may have a COPY
+      // from SGPR_32 to SReg_64. The following handles this case to avoid
+      // an illegal copy.
+      if(AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
+        auto sub0 = RI.getSubReg(DestReg, AMDGPU::sub0);
+        if (sub0 != SrcReg) {
+          BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), sub0).addReg(SrcReg);
+        }
+        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32),
+                RI.getSubReg(DestReg, AMDGPU::sub1)).addImm(0);
+        return;
+      }
+
       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
       return;
     }
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 32dad0c425c04..e4b95b66287fd 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -481,6 +481,12 @@ bool Vreg1LoweringHelper::lowerCopiesFromI1() {
       if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
         continue;
 
+      // When the calling convention allocates i1 argument to SGPR,
+      // we may have a COPY with dst being an SGPR_32. This should
+      // not be lowered into V_CNDMASK_B32.
+      if(AMDGPU::SGPR_32RegClass.contains(DstReg))
+        continue;
+
       Changed = true;
 
       // Copy into a 32-bit vector register.
diff --git a/llvm/test/CodeGen/AMDGPU/z_callee.ll b/llvm/test/CodeGen/AMDGPU/z_callee.ll
new file mode 100644
index 0000000000000..2fc4befa279f3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_callee.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+define void @void_func_i1(i1 %arg0) #0 {
+; For CIGFX89, the i1 arg is passed in s4, but the v_cndmask insn uses s[4:5].
+; Therefore, the "s_mov_b32 s5, 0" is generated.
+;
+; CIGFX89-LABEL: void_func_i1:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s5, 0
+; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller.ll b/llvm/test/CodeGen/AMDGPU/z_caller.ll
new file mode 100644
index 0000000000000..faf25e407fca2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_caller.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+
+declare hidden void @external_void_func_i1(i1) #0
+
+define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s4, -1
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_i1@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i1_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s0, -1
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+  call void @external_void_func_i1(i1 true)
+  ret void
+}
+
+attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller2.ll b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
new file mode 100644
index 0000000000000..e63ae50b7e91c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+
+declare hidden void @external_void_func_i1_signext(i1 signext) #0
+
+define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_i1_signext@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_i1_signext@rel32@hi+12
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i1_signext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GFX11-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 1, v0
+; GFX11-NEXT:    s_mov_b32 s0, s2
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX11-NEXT:    s_endpgm
+  %var = load volatile i1, ptr addrspace(1) undef
+  call void @external_void_func_i1_signext(i1 signext %var)
+  ret void
+}
+
+
+
+attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }

>From 67365b80a1ffe5962699cbe6cd3e96d7bc05cd47 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Wed, 15 Nov 2023 20:37:27 -0600
Subject: [PATCH 02/25] Fix format.

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  | 11 +++++++----
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp     | 13 +++++++------
 llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp |  2 +-
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 88e387e1df609..9fce3de9e02d2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3674,11 +3674,14 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   // reserve these registers.
   if (!Subtarget->enableFlatScratch()) {
     if (IsChainCallConv)
-      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
-          AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50, AMDGPU::SGPR51}, 4);
+      CCInfo.AllocateRegBlock(
+          ArrayRef<MCPhysReg>{AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50,
+                              AMDGPU::SGPR51},
+          4);
     else
-      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
-          AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3}, 4);
+      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1,
+                                                  AMDGPU::SGPR2, AMDGPU::SGPR3},
+                              4);
   }
 
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c0c093d3f4975..3d89e6bcd6f3a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -876,7 +876,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
-            .addReg(SrcReg, getKillRegState(KillSrc));
+        .addReg(SrcReg, getKillRegState(KillSrc));
     return;
   }
 
@@ -891,13 +891,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     if (DestReg == AMDGPU::VCC) {
       if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
-          .addReg(SrcReg, getKillRegState(KillSrc));
+            .addReg(SrcReg, getKillRegState(KillSrc));
       } else {
         // FIXME: Hack until VReg_1 removed.
         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
-          .addImm(0)
-          .addReg(SrcReg, getKillRegState(KillSrc));
+            .addImm(0)
+            .addReg(SrcReg, getKillRegState(KillSrc));
       }
 
       return;
@@ -907,13 +907,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       // When an i1 argument is allocated to an SGPR_32, we may have a COPY
       // from SGPR_32 to SReg_64. The following handles this case to avoid
       // an illegal copy.
-      if(AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
+      if (AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
         auto sub0 = RI.getSubReg(DestReg, AMDGPU::sub0);
         if (sub0 != SrcReg) {
           BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), sub0).addReg(SrcReg);
         }
         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32),
-                RI.getSubReg(DestReg, AMDGPU::sub1)).addImm(0);
+                RI.getSubReg(DestReg, AMDGPU::sub1))
+            .addImm(0);
         return;
       }
 
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index e4b95b66287fd..b2022714d9edc 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -484,7 +484,7 @@ bool Vreg1LoweringHelper::lowerCopiesFromI1() {
       // When the calling convention allocates i1 argument to SGPR,
       // we may have a COPY with dst being an SGPR_32. This should
       // not be lowered into V_CNDMASK_B32.
-      if(AMDGPU::SGPR_32RegClass.contains(DstReg))
+      if (AMDGPU::SGPR_32RegClass.contains(DstReg))
         continue;
 
       Changed = true;

>From ae46c82f9561d50d85382db345e7eb627902cf14 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 30 Nov 2023 12:31:17 -0600
Subject: [PATCH 03/25] Creating a custom calling conv function for i1.

---
 llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td   |  9 +--
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 31 +++++++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  9 ++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 24 ------
 llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp    | 13 +--
 llvm/test/CodeGen/AMDGPU/z_callee.ll          |  7 +-
 llvm/test/CodeGen/AMDGPU/z_caller.ll          |  6 +-
 llvm/test/CodeGen/AMDGPU/z_caller2.ll         |  4 +-
 llvm/test/CodeGen/AMDGPU/z_return.ll          | 80 +++++++++++++++++++
 9 files changed, 137 insertions(+), 46 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/z_return.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index faf82d412eb0c..863d489be4e83 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -189,9 +189,7 @@ def CC_AMDGPU_Func : CallingConv<[
   CCIfByVal<CCPassByVal<4, 4>>,
   CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
 
-  CCIfType<[i1] , CCAssignToReg<
-    !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
-  >>,
+  CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
 
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
     !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
@@ -207,8 +205,9 @@ def CC_AMDGPU_Func : CallingConv<[
 
 // Calling convention for leaf functions
 def RetCC_AMDGPU_Func : CallingConv<[
-  CCIfType<[i1], CCPromoteToType<i32>>,
-  CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
+  CCIfType<[i16], CCIfExtend<CCPromoteToType<i32>>>,
+  CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
+
   CCIfType<[i32, f32, i16, f16, v2i16, v2f16, bf16, v2bf16], CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d35a022ad6806..12c901ab2b45a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -29,6 +29,37 @@
 
 using namespace llvm;
 
+static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT,
+                           MVT LocVT, CCValAssign::LocInfo LocInfo,
+                           ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static bool IsWave64 = static_cast<const GCNSubtarget&>(State.getMachineFunction().getSubtarget()).isWave64();
+
+  static const MCPhysReg I1RegList1[] = {
+    AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR2_SGPR3, AMDGPU::SGPR4_SGPR5,
+    AMDGPU::SGPR6_SGPR7, AMDGPU::SGPR8_SGPR9, AMDGPU::SGPR10_SGPR11,
+    AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
+    AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
+    AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29
+  };
+
+  static const MCPhysReg I1RegList2[] = {
+    AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3, AMDGPU::SGPR4,
+    AMDGPU::SGPR5, AMDGPU::SGPR6, AMDGPU::SGPR7, AMDGPU::SGPR8, AMDGPU::SGPR9,
+    AMDGPU::SGPR10, AMDGPU::SGPR11, AMDGPU::SGPR12, AMDGPU::SGPR13,
+    AMDGPU::SGPR14, AMDGPU::SGPR15, AMDGPU::SGPR16, AMDGPU::SGPR17,
+    AMDGPU::SGPR18, AMDGPU::SGPR19, AMDGPU::SGPR20, AMDGPU::SGPR21,
+    AMDGPU::SGPR22, AMDGPU::SGPR23, AMDGPU::SGPR24, AMDGPU::SGPR25,
+    AMDGPU::SGPR26, AMDGPU::SGPR27, AMDGPU::SGPR28, AMDGPU::SGPR29
+  };
+
+  assert (LocVT == MVT::i1);
+  if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1) : State.AllocateReg(I1RegList2)) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return true;
+  }
+  return false; // not allocated
+}
+
 #include "AMDGPUGenCallingConv.inc"
 
 static cl::opt<bool> AMDGPUBypassSlowDiv(
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9fce3de9e02d2..d18ce7ce4d0ca 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3026,8 +3026,13 @@ SDValue SITargetLowering::LowerFormalArguments(
       RC = &AMDGPU::VGPR_32RegClass;
     else if (AMDGPU::SGPR_32RegClass.contains(Reg))
       RC = &AMDGPU::SGPR_32RegClass;
-    else
-      llvm_unreachable("Unexpected register class in LowerFormalArguments!");
+    else {
+      if (VT == MVT::i1 && Subtarget->isWave64())
+        RC = &AMDGPU::SGPR_64RegClass;
+      else
+        llvm_unreachable("Unexpected register class in LowerFormalArguments!");
+    }
+
     EVT ValVT = VA.getValVT();
 
     Reg = MF.addLiveIn(Reg, RC);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 3d89e6bcd6f3a..3db884b78e007 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -861,16 +861,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
-      // When calling convention allocates SGPR for i1 argument, we may
-      // have a SRPR_64 to SReg_32 copy for an outgoing i1 argument. Adjust
-      // the copy to avoid illegal copy.
-      if (AMDGPU::SGPR_64RegClass.contains(SrcReg)) {
-        auto sub0 = RI.getSubReg(SrcReg, AMDGPU::sub0);
-        if (sub0 != DestReg)
-          BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg).addReg(sub0);
-        return;
-      }
-
       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
       return;
     }
@@ -904,20 +894,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
-      // When an i1 argument is allocated to an SGPR_32, we may have a COPY
-      // from SGPR_32 to SReg_64. The following handles this case to avoid
-      // an illegal copy.
-      if (AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
-        auto sub0 = RI.getSubReg(DestReg, AMDGPU::sub0);
-        if (sub0 != SrcReg) {
-          BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), sub0).addReg(SrcReg);
-        }
-        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32),
-                RI.getSubReg(DestReg, AMDGPU::sub1))
-            .addImm(0);
-        return;
-      }
-
       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
       return;
     }
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index b2022714d9edc..00d3eabc1afc0 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -481,12 +481,6 @@ bool Vreg1LoweringHelper::lowerCopiesFromI1() {
       if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
         continue;
 
-      // When the calling convention allocates i1 argument to SGPR,
-      // we may have a COPY with dst being an SGPR_32. This should
-      // not be lowered into V_CNDMASK_B32.
-      if (AMDGPU::SGPR_32RegClass.contains(DstReg))
-        continue;
-
       Changed = true;
 
       // Copy into a 32-bit vector register.
@@ -695,6 +689,13 @@ bool Vreg1LoweringHelper::lowerCopiesToI1() {
       assert(!MI.getOperand(1).getSubReg());
 
       if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
+        if (!SrcReg.isVirtual() && TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
+          // When calling convention allocates SGPR for i1, for GPUs with wavefront size 64, i1
+          // return value is put in 64b SGPR.
+          assert(ST->isWave64());
+          continue;
+        }
+
         assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
         Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
diff --git a/llvm/test/CodeGen/AMDGPU/z_callee.ll b/llvm/test/CodeGen/AMDGPU/z_callee.ll
index 2fc4befa279f3..44af2c90f900b 100644
--- a/llvm/test/CodeGen/AMDGPU/z_callee.ll
+++ b/llvm/test/CodeGen/AMDGPU/z_callee.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
 
 define void @void_func_i1(i1 %arg0) #0 {
 ; For CIGFX89, the i1 arg is passed in s4, but the v_cndmask insn uses s[4:5].
@@ -11,7 +11,6 @@ define void @void_func_i1(i1 %arg0) #0 {
 ; CIGFX89-LABEL: void_func_i1:
 ; CIGFX89:       ; %bb.0:
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    s_mov_b32 s5, 0
 ; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller.ll b/llvm/test/CodeGen/AMDGPU/z_caller.ll
index faf25e407fca2..f9203cf078e47 100644
--- a/llvm/test/CodeGen/AMDGPU/z_caller.ll
+++ b/llvm/test/CodeGen/AMDGPU/z_caller.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
 
 
 declare hidden void @external_void_func_i1(i1) #0
@@ -17,7 +17,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    s_mov_b32 s4, -1
+; GFX9-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_getpc_b64 s[8:9]
 ; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller2.ll b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
index e63ae50b7e91c..1141476960250 100644
--- a/llvm/test/CodeGen/AMDGPU/z_caller2.ll
+++ b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
 
 
 declare hidden void @external_void_func_i1_signext(i1 signext) #0
diff --git a/llvm/test/CodeGen/AMDGPU/z_return.ll b/llvm/test/CodeGen/AMDGPU/z_return.ll
new file mode 100644
index 0000000000000..6bf64da7a1b8f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_return.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+define i1 @i1_func_void() #0 {
+  %val = load i1, ptr addrspace(1) undef
+  ret i1 %val
+}
+
+define void @test_call_i1_func_void() #0 {
+; CIGFX89-LABEL: test_call_i1_func_void:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s6, s33
+; CIGFX89-NEXT:    s_mov_b32 s33, s32
+; CIGFX89-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; CIGFX89-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; CIGFX89-NEXT:    s_mov_b64 exec, s[4:5]
+; CIGFX89-NEXT:    s_addk_i32 s32, 0x400
+; CIGFX89-NEXT:    s_getpc_b64 s[4:5]
+; CIGFX89-NEXT:    s_add_u32 s4, s4, i1_func_void@gotpcrel32@lo+4
+; CIGFX89-NEXT:    s_addc_u32 s5, s5, i1_func_void@gotpcrel32@hi+12
+; CIGFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CIGFX89-NEXT:    v_writelane_b32 v1, s30, 0
+; CIGFX89-NEXT:    v_writelane_b32 v1, s31, 1
+; CIGFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; CIGFX89-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CIGFX89-NEXT:    global_store_byte v[2:3], v0, off
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    v_readlane_b32 s31, v1, 1
+; CIGFX89-NEXT:    v_readlane_b32 s30, v1, 0
+; CIGFX89-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; CIGFX89-NEXT:    buffer_load_dword v1, off, s[0:3], s33  ; 4-byte Folded Reload
+; CIGFX89-NEXT:    s_mov_b64 exec, s[4:5]
+; CIGFX89-NEXT:    s_addk_i32 s32, 0xfc00
+; CIGFX89-NEXT:    s_mov_b32 s33, s6
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_i1_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v1, s33          ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, i1_func_void@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, i1_func_void@gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v1, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v1, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, s0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    global_store_b8 v[2:3], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v1, off, s33           ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+
+  %val = call i1 @i1_func_void()
+  store volatile i1 %val, ptr addrspace(1) undef
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+

>From 721c34d2cc19d054fc857fcf2ab568554fd5381f Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 30 Nov 2023 20:04:19 -0600
Subject: [PATCH 04/25] Fix formatting.

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 44 ++++++++++---------
 llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp    |  7 +--
 2 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 12c901ab2b45a..6f8aa496f0120 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -29,31 +29,33 @@
 
 using namespace llvm;
 
-static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT,
-                           MVT LocVT, CCValAssign::LocInfo LocInfo,
-                           ISD::ArgFlagsTy ArgFlags, CCState &State) {
-  static bool IsWave64 = static_cast<const GCNSubtarget&>(State.getMachineFunction().getSubtarget()).isWave64();
+static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT, MVT LocVT,
+                                CCValAssign::LocInfo LocInfo,
+                                ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static bool IsWave64 = static_cast<const GCNSubtarget &>(
+                             State.getMachineFunction().getSubtarget())
+                             .isWave64();
 
   static const MCPhysReg I1RegList1[] = {
-    AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR2_SGPR3, AMDGPU::SGPR4_SGPR5,
-    AMDGPU::SGPR6_SGPR7, AMDGPU::SGPR8_SGPR9, AMDGPU::SGPR10_SGPR11,
-    AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
-    AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
-    AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29
-  };
+      AMDGPU::SGPR0_SGPR1,   AMDGPU::SGPR2_SGPR3,   AMDGPU::SGPR4_SGPR5,
+      AMDGPU::SGPR6_SGPR7,   AMDGPU::SGPR8_SGPR9,   AMDGPU::SGPR10_SGPR11,
+      AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
+      AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
+      AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29};
 
   static const MCPhysReg I1RegList2[] = {
-    AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3, AMDGPU::SGPR4,
-    AMDGPU::SGPR5, AMDGPU::SGPR6, AMDGPU::SGPR7, AMDGPU::SGPR8, AMDGPU::SGPR9,
-    AMDGPU::SGPR10, AMDGPU::SGPR11, AMDGPU::SGPR12, AMDGPU::SGPR13,
-    AMDGPU::SGPR14, AMDGPU::SGPR15, AMDGPU::SGPR16, AMDGPU::SGPR17,
-    AMDGPU::SGPR18, AMDGPU::SGPR19, AMDGPU::SGPR20, AMDGPU::SGPR21,
-    AMDGPU::SGPR22, AMDGPU::SGPR23, AMDGPU::SGPR24, AMDGPU::SGPR25,
-    AMDGPU::SGPR26, AMDGPU::SGPR27, AMDGPU::SGPR28, AMDGPU::SGPR29
-  };
-
-  assert (LocVT == MVT::i1);
-  if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1) : State.AllocateReg(I1RegList2)) {
+      AMDGPU::SGPR0,  AMDGPU::SGPR1,  AMDGPU::SGPR2,  AMDGPU::SGPR3,
+      AMDGPU::SGPR4,  AMDGPU::SGPR5,  AMDGPU::SGPR6,  AMDGPU::SGPR7,
+      AMDGPU::SGPR8,  AMDGPU::SGPR9,  AMDGPU::SGPR10, AMDGPU::SGPR11,
+      AMDGPU::SGPR12, AMDGPU::SGPR13, AMDGPU::SGPR14, AMDGPU::SGPR15,
+      AMDGPU::SGPR16, AMDGPU::SGPR17, AMDGPU::SGPR18, AMDGPU::SGPR19,
+      AMDGPU::SGPR20, AMDGPU::SGPR21, AMDGPU::SGPR22, AMDGPU::SGPR23,
+      AMDGPU::SGPR24, AMDGPU::SGPR25, AMDGPU::SGPR26, AMDGPU::SGPR27,
+      AMDGPU::SGPR28, AMDGPU::SGPR29};
+
+  assert(LocVT == MVT::i1);
+  if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1)
+                              : State.AllocateReg(I1RegList2)) {
     State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
     return true;
   }
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 00d3eabc1afc0..a04ce16cbddb6 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -689,9 +689,10 @@ bool Vreg1LoweringHelper::lowerCopiesToI1() {
       assert(!MI.getOperand(1).getSubReg());
 
       if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
-        if (!SrcReg.isVirtual() && TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
-          // When calling convention allocates SGPR for i1, for GPUs with wavefront size 64, i1
-          // return value is put in 64b SGPR.
+        if (!SrcReg.isVirtual() &&
+            TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
+          // When calling convention allocates SGPR for i1, for GPUs with
+          // wavefront size 64, i1 return value is put in 64b SGPR.
           assert(ST->isWave64());
           continue;
         }

>From ca09dddea97071d08b9ad2a84d5a52a079a60f38 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 21 Dec 2023 16:13:47 -0600
Subject: [PATCH 05/25] Fixed (1) problems for global-isel wrt both incoming
 args and return value (2) a problem in AMDGPUCallingConv.td when no sgprs are
 available.

---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |  20 +-
 llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td   |   2 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  13 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   2 +-
 .../irtranslator-call-return-values.ll        |  20 +-
 .../AMDGPU/GlobalISel/irtranslator-call.ll    |  42 +-
 .../GlobalISel/irtranslator-function-args.ll  | 243 ++++++++++--
 .../GlobalISel/irtranslator-invariant.ll      |   6 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll |  48 +--
 .../CodeGen/AMDGPU/GlobalISel/localizer.ll    |   6 +-
 ...amdgpu-codegenprepare-fold-binop-select.ll | 278 ++++++-------
 llvm/test/CodeGen/AMDGPU/function-args.ll     | 371 +++++++++++++++---
 llvm/test/CodeGen/AMDGPU/function-returns.ll  |   5 +
 llvm/test/CodeGen/AMDGPU/z_callee.ll          |  32 --
 llvm/test/CodeGen/AMDGPU/z_caller.ll          |  43 --
 llvm/test/CodeGen/AMDGPU/z_caller2.ll         |  57 ---
 llvm/test/CodeGen/AMDGPU/z_return.ll          |  80 ----
 17 files changed, 754 insertions(+), 514 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/z_callee.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/z_caller.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/z_caller2.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/z_return.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 7e1f041fa1093..5e1b551a853eb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -124,7 +124,15 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
     if (VA.getLocVT().getSizeInBits() < 32) {
       // 16-bit types are reported as legal for 32-bit registers. We need to do
       // a 32-bit copy, and truncate to avoid the verifier complaining about it.
-      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
+      unsigned CopyToBits = 32;
+
+      // When function return type is i1, it may be in a 64b register.
+      if (VA.getLocVT().getSizeInBits() == 1) {
+        if (MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64)
+          CopyToBits = 64;
+      }
+
+      auto Copy = MIRBuilder.buildCopy(LLT::scalar(CopyToBits), PhysReg);
 
       // If we have signext/zeroext, it applies to the whole 32-bit register
       // before truncation.
@@ -233,7 +241,15 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
   void assignValueToReg(Register ValVReg, Register PhysReg,
                         const CCValAssign &VA) override {
     MIB.addUse(PhysReg, RegState::Implicit);
-    Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
+    Register ExtReg;
+
+    if (VA.getLocVT().getSizeInBits() == 1 &&
+        MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64) {
+      ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(64), ValVReg).getReg(0);
+    } else {
+      ExtReg = extendRegisterMin32(*this, ValVReg, VA);
+    }
+
     MIRBuilder.buildCopy(PhysReg, ExtReg);
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 863d489be4e83..0a197e4a786cc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -191,6 +191,8 @@ def CC_AMDGPU_Func : CallingConv<[
 
   CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
 
+  CCIfType<[i1], CCPromoteToType<i32>>,
+
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
     !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
   >>>,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 6f8aa496f0120..02cb248836df1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -32,18 +32,17 @@ using namespace llvm;
 static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT, MVT LocVT,
                                 CCValAssign::LocInfo LocInfo,
                                 ISD::ArgFlagsTy ArgFlags, CCState &State) {
-  static bool IsWave64 = static_cast<const GCNSubtarget &>(
-                             State.getMachineFunction().getSubtarget())
-                             .isWave64();
+  static bool IsWave64 =
+      State.getMachineFunction().getSubtarget<GCNSubtarget>().isWave64();
 
-  static const MCPhysReg I1RegList1[] = {
+  static const MCPhysReg SGPRArgsWave64[] = {
       AMDGPU::SGPR0_SGPR1,   AMDGPU::SGPR2_SGPR3,   AMDGPU::SGPR4_SGPR5,
       AMDGPU::SGPR6_SGPR7,   AMDGPU::SGPR8_SGPR9,   AMDGPU::SGPR10_SGPR11,
       AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
       AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
       AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29};
 
-  static const MCPhysReg I1RegList2[] = {
+  static const MCPhysReg SGPRArgsWave32[] = {
       AMDGPU::SGPR0,  AMDGPU::SGPR1,  AMDGPU::SGPR2,  AMDGPU::SGPR3,
       AMDGPU::SGPR4,  AMDGPU::SGPR5,  AMDGPU::SGPR6,  AMDGPU::SGPR7,
       AMDGPU::SGPR8,  AMDGPU::SGPR9,  AMDGPU::SGPR10, AMDGPU::SGPR11,
@@ -54,8 +53,8 @@ static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT, MVT LocVT,
       AMDGPU::SGPR28, AMDGPU::SGPR29};
 
   assert(LocVT == MVT::i1);
-  if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1)
-                              : State.AllocateReg(I1RegList2)) {
+  if (unsigned Reg = IsWave64 ? State.AllocateReg(SGPRArgsWave64)
+                              : State.AllocateReg(SGPRArgsWave32)) {
     State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
     return true;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d18ce7ce4d0ca..297d38385852f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3028,7 +3028,7 @@ SDValue SITargetLowering::LowerFormalArguments(
       RC = &AMDGPU::SGPR_32RegClass;
     else {
       if (VT == MVT::i1 && Subtarget->isWave64())
-        RC = &AMDGPU::SGPR_64RegClass;
+        RC = Subtarget->getBoolRC();
       else
         llvm_unreachable("Unexpected register class in LowerFormalArguments!");
     }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
index 37f2118572d84..3db0acceec0b3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
@@ -198,9 +198,9 @@ define amdgpu_kernel void @test_call_external_i1_func_void() #0 {
   ; GCN-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
   ; GCN-NEXT:   $sgpr15 = COPY [[DEF2]](s32)
   ; GCN-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0
-  ; GCN-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
@@ -275,10 +275,9 @@ define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 {
   ; GCN-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
   ; GCN-NEXT:   $sgpr15 = COPY [[DEF2]](s32)
   ; GCN-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0
-  ; GCN-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; GCN-NEXT:   [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY19]], 1
-  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_ZEXT]](s32)
+  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
   ; GCN-NEXT:   G_STORE [[ZEXT]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
@@ -336,10 +335,9 @@ define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 {
   ; GCN-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
   ; GCN-NEXT:   $sgpr15 = COPY [[DEF2]](s32)
   ; GCN-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0
-  ; GCN-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; GCN-NEXT:   [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY19]], 1
-  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_SEXT]](s32)
+  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
   ; GCN-NEXT:   G_STORE [[SEXT]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index 392b0ae6823e4..e546144ce3373 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -368,12 +368,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
   ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
   ; CHECK-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32)
   ; CHECK-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF]](p4)
+  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s1)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[ANYEXT]](s64)
+  ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
   ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
   ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
   ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
@@ -381,7 +381,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[DEF1]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   S_ENDPGM 0
   call void @external_void_func_i1(i1 true)
@@ -426,12 +426,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
   ; CHECK-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32)
   ; CHECK-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[SEXT]](s32)
-  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF1]](p4)
+  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s1)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[SEXT]](s64)
+  ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
   ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
   ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
   ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
@@ -439,7 +439,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[DEF2]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_signext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_signext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   S_ENDPGM 0
   %var = load volatile i1, ptr addrspace(1) undef
@@ -485,12 +485,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
   ; CHECK-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32)
   ; CHECK-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ZEXT]](s32)
-  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF1]](p4)
+  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s1)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[ZEXT]](s64)
+  ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
   ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
   ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
   ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
@@ -498,7 +498,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[DEF2]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_zeroext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_zeroext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   S_ENDPGM 0
   %var = load volatile i1, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index 6d32d4c720c99..2c8f22ed57ab2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -3,6 +3,7 @@
 ; the frame info, so some functions have manually added stack object
 ; checks.
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s
 ; FIXME: pre-VI should have same ABI without legal i16 operations.
 
 define void @void_func_empty_arg({} %arg0, i32 %arg1) #0 {
@@ -34,10 +35,10 @@ define void @void_func_empty_array([0 x i8] %arg0, i32 %arg1) #0 {
 define void @void_func_i1(i1 %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
@@ -48,11 +49,10 @@ define void @void_func_i1(i1 %arg0) #0 {
 define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1_zeroext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 1
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_ZEXT]](s32)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
@@ -68,11 +68,10 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
 define void @void_func_i1_signext(i1 signext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1_signext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY]], 1
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_SEXT]](s32)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
@@ -89,10 +88,10 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
   ; CHECK-LABEL: name: i1_arg_i1_use
   ; CHECK: bb.1.bb:
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -1986,25 +1985,25 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
   ; CHECK-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28
   ; CHECK-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
   ; CHECK-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
-  ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.4, align 16, addrspace 5)
+  ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, addrspace 5)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32)
-  ; CHECK-NEXT:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s1) from %fixed-stack.3, align 4, addrspace 5)
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD1]](s32)
-  ; CHECK-NEXT:   [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
-  ; CHECK-NEXT:   [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s16) from %fixed-stack.2, align 8, addrspace 5)
-  ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD2]](s16)
-  ; CHECK-NEXT:   [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
-  ; CHECK-NEXT:   [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s16) from %fixed-stack.1, align 4, addrspace 5)
-  ; CHECK-NEXT:   [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
-  ; CHECK-NEXT:   [[LOAD4:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s16) from %fixed-stack.0, align 16, addrspace 5)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s64)
+  ; CHECK-NEXT:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
+  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s16) from %fixed-stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s16)
+  ; CHECK-NEXT:   [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
+  ; CHECK-NEXT:   [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s16) from %fixed-stack.1, align 8, addrspace 5)
+  ; CHECK-NEXT:   [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
+  ; CHECK-NEXT:   [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s16) from %fixed-stack.0, align 4, addrspace 5)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+
   ; CHECK-NEXT:   G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[LOAD2]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   G_STORE [[LOAD3]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD4]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile i1 %arg1, ptr addrspace(1) undef
@@ -3230,6 +3229,196 @@ define void @void_func_v2p3_inreg(<2 x ptr addrspace(3)> inreg %arg0) #0 {
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<2 x p3>), [[DEF]](p1) :: (store (<2 x p3>) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
   store <2 x ptr addrspace(3)> %arg0, ptr addrspace(1) undef
+; Check calling convention for i1 args
+define void @many_i1_args(
+  i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
+  i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
+  i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
+  i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
+; CHECK-LABEL: name: many_i1_args
+; CHECK: bb.1 (%ir-block.0):
+; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
+; CHECK-NEXT: {{  $}}
+; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s64) = COPY $sgpr2_sgpr3
+; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s64)
+; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; CHECK-NEXT:   [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
+; CHECK-NEXT:   [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
+; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(s64) = COPY $sgpr8_sgpr9
+; CHECK-NEXT:   [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s64)
+; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(s64) = COPY $sgpr10_sgpr11
+; CHECK-NEXT:   [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s64)
+; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(s64) = COPY $sgpr12_sgpr13
+; CHECK-NEXT:   [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s64)
+; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(s64) = COPY $sgpr14_sgpr15
+; CHECK-NEXT:   [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s64)
+; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
+; CHECK-NEXT:   [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s64)
+; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(s64) = COPY $sgpr18_sgpr19
+; CHECK-NEXT:   [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s64)
+; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(s64) = COPY $sgpr20_sgpr21
+; CHECK-NEXT:   [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s64)
+; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(s64) = COPY $sgpr22_sgpr23
+; CHECK-NEXT:   [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s64)
+; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s64) = COPY $sgpr24_sgpr25
+; CHECK-NEXT:   [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s64)
+; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s64) = COPY $sgpr26_sgpr27
+; CHECK-NEXT:   [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s64)
+; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s64) = COPY $sgpr28_sgpr29
+; CHECK-NEXT:   [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s64)
+; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr0
+; CHECK-NEXT:   [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
+; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr1
+; CHECK-NEXT:   [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
+; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr2
+; CHECK-NEXT:   [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
+; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr3
+; CHECK-NEXT:   [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
+; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr4
+; CHECK-NEXT:   [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr5
+; CHECK-NEXT:   [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
+; CHECK-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr6
+; CHECK-NEXT:   [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+; CHECK-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr7
+; CHECK-NEXT:   [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
+; CHECK-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr8
+; CHECK-NEXT:   [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
+; CHECK-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr9
+; CHECK-NEXT:   [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
+; CHECK-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr10
+; CHECK-NEXT:   [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
+; CHECK-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr11
+; CHECK-NEXT:   [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
+; CHECK-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr12
+; CHECK-NEXT:   [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
+; CHECK-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr13
+; CHECK-NEXT:   [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
+; CHECK-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr14
+; CHECK-NEXT:   [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; CHECK-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr15
+; CHECK-NEXT:   [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr16
+; CHECK-NEXT:   [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+;
+; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs to TRUNC1-TRUNC30 omitted
+; CHECK:        G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+;
+; GFX11-LABEL: name: many_i1_args
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT:   [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
+; GFX11-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+; GFX11-NEXT:   [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr3
+; GFX11-NEXT:   [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s32)
+; GFX11-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX11-NEXT:   [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s32)
+; GFX11-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr5
+; GFX11-NEXT:   [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s32)
+; GFX11-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX11-NEXT:   [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s32)
+; GFX11-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr7
+; GFX11-NEXT:   [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
+; GFX11-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr8
+; GFX11-NEXT:   [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
+; GFX11-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr9
+; GFX11-NEXT:   [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
+; GFX11-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr10
+; GFX11-NEXT:   [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
+; GFX11-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr11
+; GFX11-NEXT:   [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
+; GFX11-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr12
+; GFX11-NEXT:   [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
+; GFX11-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY $sgpr13
+; GFX11-NEXT:   [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
+; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY $sgpr14
+; GFX11-NEXT:   [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
+; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $sgpr15
+; GFX11-NEXT:   [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
+; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $sgpr16
+; GFX11-NEXT:   [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
+; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $sgpr17
+; GFX11-NEXT:   [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
+; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $sgpr18
+; GFX11-NEXT:   [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
+; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $sgpr19
+; GFX11-NEXT:   [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $sgpr20
+; GFX11-NEXT:   [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
+; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $sgpr21
+; GFX11-NEXT:   [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $sgpr22
+; GFX11-NEXT:   [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
+; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $sgpr23
+; GFX11-NEXT:   [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
+; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $sgpr24
+; GFX11-NEXT:   [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
+; GFX11-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $sgpr25
+; GFX11-NEXT:   [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
+; GFX11-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $sgpr26
+; GFX11-NEXT:   [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
+; GFX11-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $sgpr27
+; GFX11-NEXT:   [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
+; GFX11-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $sgpr28
+; GFX11-NEXT:   [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
+; GFX11-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $sgpr29
+; GFX11-NEXT:   [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; GFX11-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT:   [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; GFX11-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX11-NEXT:   [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+;
+; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs to TRUNC1-TRUNC30 omitted
+; GFX11:        G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  store volatile i1 %arg2, ptr addrspace(1) undef
+  store volatile i1 %arg3, ptr addrspace(1) undef
+  store volatile i1 %arg4, ptr addrspace(1) undef
+  store volatile i1 %arg5, ptr addrspace(1) undef
+  store volatile i1 %arg6, ptr addrspace(1) undef
+  store volatile i1 %arg7, ptr addrspace(1) undef
+
+  store volatile i1 %arg8, ptr addrspace(1) undef
+  store volatile i1 %arg9, ptr addrspace(1) undef
+  store volatile i1 %arg10, ptr addrspace(1) undef
+  store volatile i1 %arg11, ptr addrspace(1) undef
+  store volatile i1 %arg12, ptr addrspace(1) undef
+  store volatile i1 %arg13, ptr addrspace(1) undef
+  store volatile i1 %arg14, ptr addrspace(1) undef
+  store volatile i1 %arg15, ptr addrspace(1) undef
+
+  store volatile i1 %arg16, ptr addrspace(1) undef
+  store volatile i1 %arg17, ptr addrspace(1) undef
+  store volatile i1 %arg18, ptr addrspace(1) undef
+  store volatile i1 %arg19, ptr addrspace(1) undef
+  store volatile i1 %arg20, ptr addrspace(1) undef
+  store volatile i1 %arg21, ptr addrspace(1) undef
+  store volatile i1 %arg22, ptr addrspace(1) undef
+  store volatile i1 %arg23, ptr addrspace(1) undef
+
+  store volatile i1 %arg24, ptr addrspace(1) undef
+  store volatile i1 %arg25, ptr addrspace(1) undef
+  store volatile i1 %arg26, ptr addrspace(1) undef
+  store volatile i1 %arg27, ptr addrspace(1) undef
+  store volatile i1 %arg28, ptr addrspace(1) undef
+  store volatile i1 %arg29, ptr addrspace(1) undef
+  store volatile i1 %arg30, ptr addrspace(1) undef
+  store volatile i1 %arg31, ptr addrspace(1) undef
+
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
index ec07b0b1d4f45..ac1eb4e2adda0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
@@ -22,10 +22,10 @@ define i32 @load_const_i32_gv() {
 define i32 @load_select_const_i32_gv(i1 %cond) {
   ; CHECK-LABEL: name: load_select_const_i32_gv
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv0
   ; CHECK-NEXT:   [[GV1:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv1
   ; CHECK-NEXT:   [[SELECT:%[0-9]+]]:_(p1) = G_SELECT [[TRUNC]](s1), [[GV]], [[GV1]]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 4caf83774bbba..979590fd11688 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -11,8 +11,8 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX7-LABEL: v_div_fmas_f32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX7-NEXT:    s_and_b32 s4, 1, s0
+; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX7-NEXT:    s_nop 3
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -20,8 +20,8 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX8-LABEL: v_div_fmas_f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX8-NEXT:    s_and_b32 s4, 1, s0
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX8-NEXT:    s_nop 3
 ; GFX8-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -29,32 +29,32 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX10_W32-LABEL: v_div_fmas_f32:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W32-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX10_W32-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX10_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_W64-LABEL: v_div_fmas_f32:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX10_W64-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX10_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11_W32-LABEL: v_div_fmas_f32:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W32-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX11_W32-NEXT:    s_and_b32 s0, 1, s0
+; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX11_W32-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX11_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11_W64-LABEL: v_div_fmas_f32:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W64-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX11_W64-NEXT:    s_and_b32 s0, 1, s0
+; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX11_W64-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX11_W64-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d)
@@ -65,8 +65,8 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX7-LABEL: v_div_fmas_f64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX7-NEXT:    s_and_b32 s4, 1, s0
+; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX7-NEXT:    s_nop 3
 ; GFX7-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -74,8 +74,8 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX8-LABEL: v_div_fmas_f64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX8-NEXT:    s_and_b32 s4, 1, s0
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX8-NEXT:    s_nop 3
 ; GFX8-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -83,32 +83,32 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX10_W32-LABEL: v_div_fmas_f64:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W32-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX10_W32-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX10_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_W64-LABEL: v_div_fmas_f64:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX10_W64-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX10_W64-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX10_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11_W32-LABEL: v_div_fmas_f64:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W32-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11_W32-NEXT:    s_and_b32 s0, 1, s0
+; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX11_W32-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX11_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11_W64-LABEL: v_div_fmas_f64:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W64-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX11_W64-NEXT:    s_and_b32 s0, 1, s0
+; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX11_W64-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX11_W64-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 36bac87889cac..1cff9ba4d2340 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -168,9 +168,9 @@ define void @localize_internal_globals(i1 %cond) {
 ; GFX9-LABEL: localize_internal_globals:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:    s_and_b32 s4, 1, s0
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB2_3
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index 5c40a4ce13e31..9beec51710598 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -10,11 +10,10 @@ define i32 @select_sdiv_lhs_const_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_lhs_const_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x1e848
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x30d40
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_mov_b32 s6, 0x30d40
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x1e848
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 5, i32 8
   %op = sdiv i32 1000000, %select
@@ -29,11 +28,10 @@ define i32 @select_sdiv_rhs_const_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_rhs_const_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x2710
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x3e8
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_movk_i32 s6, 0x3e8
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x2710
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 42000, i32 420000
   %op = sdiv i32 %select, 42
@@ -48,11 +46,10 @@ define <2 x i32> @select_sdiv_lhs_const_v2i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_lhs_const_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x22b
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x29a
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_movk_i32 s6, 0x29a
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x22b
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x594
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, <2 x i32> <i32 5, i32 undef>, <2 x i32> <i32 6, i32 7>
@@ -68,14 +65,13 @@ define <2 x i32> @select_sdiv_rhs_const_v2i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_rhs_const_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x3661c
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x307dd
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x23b02a
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x13e3a0c
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_mov_b32 s6, 0x307dd
+; GCN-NEXT:    s_mov_b32 s5, 0x13e3a0c
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x3661c
+; GCN-NEXT:    s_cselect_b32 s5, s5, 0x23b02a
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, <2 x i32> <i32 8342123, i32 834212353>, <2 x i32> <i32 9355456, i32 93554321>
   %op = sdiv <2 x i32> %select, <i32 42, i32 40>
@@ -126,40 +122,41 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_lhs_opaque_const0_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, gv@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, gv@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 5, v1, vcc
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_cselect_b32 s4, s6, 5
+; GCN-NEXT:    s_ashr_i32 s5, s4, 31
+; GCN-NEXT:    s_add_i32 s4, s4, s5
+; GCN-NEXT:    s_xor_b32 s4, s4, s5
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GCN-NEXT:    s_sub_i32 s6, 0, s4
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_lo_u32 v1, s6, v0
+; GCN-NEXT:    s_mov_b32 s6, 0xf4240
+; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v0
-; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0, v0
-; GCN-NEXT:    s_mov_b32 s4, 0xf4240
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_lo_u32 v3, v3, v2
-; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
-; GCN-NEXT:    v_mul_hi_u32 v2, v2, s4
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v0
-; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0xf4240, v3
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT:    v_sub_u32_e64 v4, s[4:5], v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s6
+; GCN-NEXT:    v_readfirstlane_b32 s6, v0
+; GCN-NEXT:    s_mul_i32 s6, s6, s4
+; GCN-NEXT:    s_sub_i32 s6, 0xf4240, s6
+; GCN-NEXT:    s_sub_i32 s7, s6, s4
+; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT:    s_cmp_ge_u32 s6, s4
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    s_cselect_b32 s6, s7, s6
+; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT:    s_cmp_ge_u32 s6, s4
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, s5, v0
+; GCN-NEXT:    v_subrev_u32_e32 v0, vcc, s5, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 ptrtoint (ptr addrspace(1) @gv to i32), i32 5
   %op = sdiv i32 1000000, %select
@@ -208,40 +205,41 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_lhs_opaque_const1_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, gv@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, gv@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 5, vcc
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_cselect_b32 s4, 5, s6
+; GCN-NEXT:    s_ashr_i32 s5, s4, 31
+; GCN-NEXT:    s_add_i32 s4, s4, s5
+; GCN-NEXT:    s_xor_b32 s4, s4, s5
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GCN-NEXT:    s_sub_i32 s6, 0, s4
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_lo_u32 v1, s6, v0
+; GCN-NEXT:    s_mov_b32 s6, 0xf4240
+; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v0
-; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0, v0
-; GCN-NEXT:    s_mov_b32 s4, 0xf4240
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_lo_u32 v3, v3, v2
-; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
-; GCN-NEXT:    v_mul_hi_u32 v2, v2, s4
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v0
-; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0xf4240, v3
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT:    v_sub_u32_e64 v4, s[4:5], v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s6
+; GCN-NEXT:    v_readfirstlane_b32 s6, v0
+; GCN-NEXT:    s_mul_i32 s6, s6, s4
+; GCN-NEXT:    s_sub_i32 s6, 0xf4240, s6
+; GCN-NEXT:    s_sub_i32 s7, s6, s4
+; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT:    s_cmp_ge_u32 s6, s4
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    s_cselect_b32 s6, s7, s6
+; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT:    s_cmp_ge_u32 s6, s4
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, s5, v0
+; GCN-NEXT:    v_subrev_u32_e32 v0, vcc, s5, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 5, i32 ptrtoint (ptr addrspace(1) @gv to i32)
   %op = sdiv i32 1000000, %select
@@ -257,18 +255,15 @@ define i32 @select_sdiv_rhs_opaque_const0_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_rhs_opaque_const0_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x392fa
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, gv@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, gv@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x30c30c31
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT:    s_mov_b32 s4, 0x30c30c31
-; GCN-NEXT:    v_mul_hi_i32 v0, v0, s4
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x392fa
+; GCN-NEXT:    v_mul_hi_i32 v0, s4, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 3, v0
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -287,18 +282,15 @@ define i32 @select_sdiv_rhs_opaque_const1_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_rhs_opaque_const1_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0xa410
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, gv@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, gv@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x30c30c31
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT:    s_mov_b32 s4, 0x30c30c31
-; GCN-NEXT:    v_mul_hi_i32 v0, v0, s4
+; GCN-NEXT:    s_cselect_b32 s4, 0xa410, s6
+; GCN-NEXT:    v_mul_hi_i32 v0, s4, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 3, v0
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -316,11 +308,10 @@ define i32 @select_add_lhs_const_i32(i1 %cond) {
 ; GCN-LABEL: select_add_lhs_const_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0xf4248
-; GCN-NEXT:    v_mov_b32_e32 v2, 0xf4245
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_mov_b32 s6, 0xf4245
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0xf4248
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 5, i32 8
   %op = add i32 1000000, %select
@@ -335,11 +326,9 @@ define float @select_fadd_lhs_const_i32_fmf(i1 %cond) {
 ; GCN-LABEL: select_fadd_lhs_const_i32_fmf:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x40a00000
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x40400000
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x40a00000
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, float 2.0, float 4.0
   %op = fadd nnan nsz float 1.0, %select
@@ -351,12 +340,10 @@ define i32 @select_mul_lhs_const_i32(i1 %cond) {
 ; GCN-LABEL: select_mul_lhs_const_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x1f40
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x1388
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_movk_i32 s6, 0x1388
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x1f40
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; IR-LABEL: @select_mul_lhs_const_i32(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
 ; IR-NEXT:    ret i32 [[OP]]
@@ -370,12 +357,10 @@ define i32 @select_mul_rhs_const_i32(i1 %cond) {
 ; GCN-LABEL: select_mul_rhs_const_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x1f40
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x1388
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_movk_i32 s6, 0x1388
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x1f40
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; IR-LABEL: @select_mul_rhs_const_i32(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
 ; IR-NEXT:    ret i32 [[OP]]
@@ -411,9 +396,7 @@ define i16 @select_add_trunc_select(i1 %cond) {
 ; GCN-LABEL: select_add_trunc_select:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 47, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 47, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ; IR-LABEL: @select_add_trunc_select(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50
@@ -432,9 +415,9 @@ define i32 @select_add_sext_select(i1 %cond) {
 ; GCN-LABEL: select_add_sext_select:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 29, vcc
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, 29, 50
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i16 -13, i16 8
   %trunc = sext i16 %select to i32
@@ -450,9 +433,9 @@ define i32 @select_add_zext_select(i1 %cond) {
 ; GCN-LABEL: select_add_zext_select:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 47, vcc
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, 47, 50
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i16 5, i16 8
   %trunc = zext i16 %select to i32
@@ -468,11 +451,10 @@ define i32 @select_add_bitcast_select(i1 %cond) {
 ; GCN-LABEL: select_add_bitcast_select:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x4000002a
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x3f80002a
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_mov_b32 s6, 0x3f80002a
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x4000002a
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, float 1.0, float 2.0
   %trunc = bitcast float %select to i32
@@ -493,10 +475,8 @@ define <2 x half> @multi_use_cast_regression(i1 %cond) {
 ; GCN-LABEL: multi_use_cast_regression:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x3c00
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[4:5]
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GCN-NEXT:    v_sub_f32_e32 v1, 1.0, v0
 ; GCN-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 3b2f15c8340a6..9fca84ef2667c 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -8,7 +8,7 @@ define void @void_func_i1(i1 %arg0) #0 {
 ; CIGFX89-LABEL: void_func_i1:
 ; CIGFX89:       ; %bb.0:
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    v_and_b32_e32 v0, 1, v0
+; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
@@ -18,7 +18,7 @@ define void @void_func_i1(i1 %arg0) #0 {
 ; GFX11-LABEL: void_func_i1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
@@ -31,6 +31,7 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
 ; CIGFX89-LABEL: void_func_i1_zeroext:
 ; CIGFX89:       ; %bb.0:
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; CIGFX89-NEXT:    v_or_b32_e32 v0, 12, v0
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
@@ -41,9 +42,11 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
 ; GFX11-LABEL: void_func_i1_zeroext:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_or_b32_e32 v0, 12, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, 12, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = zext i1 %arg0 to i32
@@ -56,7 +59,8 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
 ; CI-LABEL: void_func_i1_signext:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_add_i32_e32 v0, vcc, 12, v0
+; CI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CI-NEXT:    v_sub_i32_e32 v0, vcc, 12, v0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -66,7 +70,8 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
 ; VI-LABEL: void_func_i1_signext:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 12, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; VI-NEXT:    v_sub_u32_e32 v0, vcc, 12, v0
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -76,7 +81,8 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
 ; GFX9-LABEL: void_func_i1_signext:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, 12, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:    v_sub_u32_e32 v0, 12, v0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -86,9 +92,11 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
 ; GFX11-LABEL: void_func_i1_signext:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, 12, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 12, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = sext i1 %arg0 to i32
@@ -101,9 +109,7 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
 ; CIGFX89-LABEL: i1_arg_i1_use:
 ; CIGFX89:       ; %bb.0: ; %bb
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    v_and_b32_e32 v0, 1, v0
-; CIGFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; CIGFX89-NEXT:    s_xor_b64 s[6:7], vcc, -1
+; CIGFX89-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
 ; CIGFX89-NEXT:    s_and_saveexec_b64 s[4:5], s[6:7]
 ; CIGFX89-NEXT:    s_cbranch_execz .LBB3_2
 ; CIGFX89-NEXT:  ; %bb.1: ; %bb1
@@ -119,11 +125,9 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
 ; GFX11-LABEL: i1_arg_i1_use:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    s_xor_b32 s1, vcc_lo, -1
+; GFX11-NEXT:    s_xor_b32 s1, s0, -1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s1
 ; GFX11-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX11-NEXT:  ; %bb.1: ; %bb1
@@ -2774,13 +2778,11 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:12
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:16
-; CI-NEXT:    buffer_load_ubyte v17, off, s[0:3], s32 offset:4
-; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:8
-; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2789,15 +2791,15 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT:    v_mul_f32_e32 v20, 1.0, v20
-; CI-NEXT:    v_and_b32_e32 v0, 1, v17
-; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v20
+; CI-NEXT:    v_cvt_f16_f32_e32 v18, v20
+; CI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; CI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v18, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v17, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v19, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v18, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_short v16, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -2818,13 +2820,12 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:4
+; VI-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2833,14 +2834,13 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v0, 1, v20
-; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v18, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v20, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; VI-NEXT:    buffer_store_short v16, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v18, off, s[4:7], 0
+; VI-NEXT:    buffer_store_short v17, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_short v19, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -2859,15 +2859,12 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2876,14 +2873,13 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v20
-; GFX9-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v18, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v20, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_short v16, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v18, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_short v17, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_short v19, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -2892,16 +2888,15 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX11-LABEL: void_func_v32i32_i1_i8_i16_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x5
+; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    scratch_load_u8 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_u16 v33, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v34, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v35, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v36, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u16 v33, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_u16 v34, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v35, off, s32 offset:12
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    v_cndmask_b32_e64 v32, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
@@ -2910,8 +2905,6 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_and_b32_e32 v16, 1, v32
 ; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
@@ -2920,7 +2913,7 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v16, off, s[0:3], 0 dlc
+; GFX11-NEXT:    buffer_store_b8 v32, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
@@ -4634,7 +4627,6 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
   ret void
 }
 
-
 define void @void_func_bf16(bfloat %arg0) #0 {
 ; CI-LABEL: void_func_bf16:
 ; CI:       ; %bb.0:
@@ -4891,4 +4883,275 @@ define void @void_func_v16bf16(<16 x bfloat> %arg0) #0 {
   ret void
 }
 
+define void @many_i1_args(
+  i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
+  i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
+  i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
+  i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
+; GFX9-LABEL: many_i1_args:
+; GFX9:      ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_xor_saveexec_b64 vcc, -1
+; GFX9-NEXT:    buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, vcc
+; GFX9-NEXT:    v_writelane_b32 v19, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v19, s31, 1
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GFX9-NEXT:    s_mov_b32 s31, 0xf000
+; GFX9-NEXT:    s_mov_b32 s30, -1
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[6:7]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[8:9]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[10:11]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[12:13]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[14:15]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[16:17]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[18:19]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[20:21]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[22:23]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[24:25]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[26:27]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[28:29]
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v1
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v3
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v4
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v5
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v6
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v7
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v8
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v9
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v10
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v11
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v12
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v13
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v14
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v15
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v16
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v17
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v18
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v19, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v19, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: many_i1_args:
+; GFX11:      ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s32          ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, vcc_lo
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s1
+; GFX11-NEXT:    s_mov_b32 s30, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_mov_b32 s31, 0x31016000
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s6
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s11
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s13
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s15
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s16
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s17
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s19
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s20
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s21
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s22
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s23
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s24
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s25
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s26
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s27
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s28
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s29
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v1, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32           ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  store volatile i1 %arg2, ptr addrspace(1) undef
+  store volatile i1 %arg3, ptr addrspace(1) undef
+  store volatile i1 %arg4, ptr addrspace(1) undef
+  store volatile i1 %arg5, ptr addrspace(1) undef
+  store volatile i1 %arg6, ptr addrspace(1) undef
+  store volatile i1 %arg7, ptr addrspace(1) undef
+
+  store volatile i1 %arg8, ptr addrspace(1) undef
+  store volatile i1 %arg9, ptr addrspace(1) undef
+  store volatile i1 %arg10, ptr addrspace(1) undef
+  store volatile i1 %arg11, ptr addrspace(1) undef
+  store volatile i1 %arg12, ptr addrspace(1) undef
+  store volatile i1 %arg13, ptr addrspace(1) undef
+  store volatile i1 %arg14, ptr addrspace(1) undef
+  store volatile i1 %arg15, ptr addrspace(1) undef
+
+  store volatile i1 %arg16, ptr addrspace(1) undef
+  store volatile i1 %arg17, ptr addrspace(1) undef
+  store volatile i1 %arg18, ptr addrspace(1) undef
+  store volatile i1 %arg19, ptr addrspace(1) undef
+  store volatile i1 %arg20, ptr addrspace(1) undef
+  store volatile i1 %arg21, ptr addrspace(1) undef
+  store volatile i1 %arg22, ptr addrspace(1) undef
+  store volatile i1 %arg23, ptr addrspace(1) undef
+
+  store volatile i1 %arg24, ptr addrspace(1) undef
+  store volatile i1 %arg25, ptr addrspace(1) undef
+  store volatile i1 %arg26, ptr addrspace(1) undef
+  store volatile i1 %arg27, ptr addrspace(1) undef
+  store volatile i1 %arg28, ptr addrspace(1) undef
+  store volatile i1 %arg29, ptr addrspace(1) undef
+  store volatile i1 %arg30, ptr addrspace(1) undef
+  store volatile i1 %arg31, ptr addrspace(1) undef
+
+  ret void
+}
+
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 401cbce00ac9a..df2163c4f9578 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -12,6 +12,8 @@ define i1 @i1_func_void() #0 {
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
 ; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX789-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i1_func_void:
@@ -21,6 +23,9 @@ define i1 @i1_func_void() #0 {
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
diff --git a/llvm/test/CodeGen/AMDGPU/z_callee.ll b/llvm/test/CodeGen/AMDGPU/z_callee.ll
deleted file mode 100644
index 44af2c90f900b..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_callee.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-define void @void_func_i1(i1 %arg0) #0 {
-; For CIGFX89, the i1 arg is passed in s4, but the v_cndmask insn uses s[4:5].
-; Therefore, the "s_mov_b32 s5, 0" is generated.
-;
-; CIGFX89-LABEL: void_func_i1:
-; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
-; CIGFX89-NEXT:    s_mov_b32 s6, -1
-; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: void_func_i1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store i1 %arg0, ptr addrspace(1) undef
-  ret void
-}
-
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller.ll b/llvm/test/CodeGen/AMDGPU/z_caller.ll
deleted file mode 100644
index f9203cf078e47..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_caller.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-
-declare hidden void @external_void_func_i1(i1) #0
-
-define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
-; GFX9-LABEL: test_call_external_void_func_i1_imm:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s38, -1
-; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    s_mov_b64 s[4:5], -1
-; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_getpc_b64 s[8:9]
-; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_i1 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_i1 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX11-LABEL: test_call_external_void_func_i1_imm:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    s_mov_b32 s0, -1
-; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1 at rel32@hi+12
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT:    s_endpgm
-  call void @external_void_func_i1(i1 true)
-  ret void
-}
-
-attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller2.ll b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
deleted file mode 100644
index 1141476960250..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_caller2.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-
-declare hidden void @external_void_func_i1_signext(i1 signext) #0
-
-define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
-; GFX9-LABEL: test_call_external_void_func_i1_signext:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s38, -1
-; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s5
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_getpc_b64 s[8:9]
-; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_i1_signext at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_i1_signext at rel32@hi+12
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX11-LABEL: test_call_external_void_func_i1_signext:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[4:5]
-; GFX11-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext at rel32@hi+12
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 1, v0
-; GFX11-NEXT:    s_mov_b32 s0, s2
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX11-NEXT:    s_endpgm
-  %var = load volatile i1, ptr addrspace(1) undef
-  call void @external_void_func_i1_signext(i1 signext %var)
-  ret void
-}
-
-
-
-attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/z_return.ll b/llvm/test/CodeGen/AMDGPU/z_return.ll
deleted file mode 100644
index 6bf64da7a1b8f..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_return.ll
+++ /dev/null
@@ -1,80 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-define i1 @i1_func_void() #0 {
-  %val = load i1, ptr addrspace(1) undef
-  ret i1 %val
-}
-
-define void @test_call_i1_func_void() #0 {
-; CIGFX89-LABEL: test_call_i1_func_void:
-; CIGFX89: ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    s_mov_b32 s6, s33
-; CIGFX89-NEXT:    s_mov_b32 s33, s32
-; CIGFX89-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CIGFX89-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
-; CIGFX89-NEXT:    s_mov_b64 exec, s[4:5]
-; CIGFX89-NEXT:    s_addk_i32 s32, 0x400
-; CIGFX89-NEXT:    s_getpc_b64 s[4:5]
-; CIGFX89-NEXT:    s_add_u32 s4, s4, i1_func_void at gotpcrel32@lo+4
-; CIGFX89-NEXT:    s_addc_u32 s5, s5, i1_func_void at gotpcrel32@hi+12
-; CIGFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; CIGFX89-NEXT:    v_writelane_b32 v1, s30, 0
-; CIGFX89-NEXT:    v_writelane_b32 v1, s31, 1
-; CIGFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; CIGFX89-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CIGFX89-NEXT:    global_store_byte v[2:3], v0, off
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    v_readlane_b32 s31, v1, 1
-; CIGFX89-NEXT:    v_readlane_b32 s30, v1, 0
-; CIGFX89-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CIGFX89-NEXT:    buffer_load_dword v1, off, s[0:3], s33  ; 4-byte Folded Reload
-; CIGFX89-NEXT:    s_mov_b64 exec, s[4:5]
-; CIGFX89-NEXT:    s_addk_i32 s32, 0xfc00
-; CIGFX89-NEXT:    s_mov_b32 s33, s6
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: test_call_i1_func_void:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s2, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v1, s33          ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, i1_func_void at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, i1_func_void at gotpcrel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v1, s30, 0
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_writelane_b32 v1, s31, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, s0, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v1, 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    global_store_b8 v[2:3], v0, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v1, off, s33           ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, -16
-; GFX11-NEXT:    s_mov_b32 s33, s2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-
-  %val = call i1 @i1_func_void()
-  store volatile i1 %val, ptr addrspace(1) undef
-  ret void
-}
-
-attributes #0 = { nounwind }
-
-

>From f26afca5d23d1ad9bf02883cbd2ccfb97414457b Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Tue, 16 Jan 2024 16:22:20 -0600
Subject: [PATCH 06/25] Minor changes based on code review.

---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 11 +++++------
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  2 +-
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 5e1b551a853eb..94c62f8ddc0e1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -127,7 +127,7 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
       unsigned CopyToBits = 32;
 
       // When function return type is i1, it may be in a 64b register.
-      if (VA.getLocVT().getSizeInBits() == 1) {
+      if (VA.getLocVT() == MVT::i1) {
         if (MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64)
           CopyToBits = 64;
       }
@@ -241,15 +241,14 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
   void assignValueToReg(Register ValVReg, Register PhysReg,
                         const CCValAssign &VA) override {
     MIB.addUse(PhysReg, RegState::Implicit);
-    Register ExtReg;
 
-    if (VA.getLocVT().getSizeInBits() == 1 &&
+    if (VA.getLocVT() == MVT::i1 &&
         MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64) {
-      ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(64), ValVReg).getReg(0);
-    } else {
-      ExtReg = extendRegisterMin32(*this, ValVReg, VA);
+      MIRBuilder.buildCopy(PhysReg, ValVReg);
+      return;
     }
 
+    Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
     MIRBuilder.buildCopy(PhysReg, ExtReg);
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 297d38385852f..54797b29d8965 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3027,7 +3027,7 @@ SDValue SITargetLowering::LowerFormalArguments(
     else if (AMDGPU::SGPR_32RegClass.contains(Reg))
       RC = &AMDGPU::SGPR_32RegClass;
     else {
-      if (VT == MVT::i1 && Subtarget->isWave64())
+      if (VT == MVT::i1)
         RC = Subtarget->getBoolRC();
       else
         llvm_unreachable("Unexpected register class in LowerFormalArguments!");

>From 26fa9cc68172db8d26d13427ebddca5c16355e8a Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 22 Jan 2024 16:23:43 -0600
Subject: [PATCH 07/25] Additional change based on code review.

---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 94c62f8ddc0e1..53dbae7765803 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -128,7 +128,7 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
 
       // When function return type is i1, it may be in a 64b register.
       if (VA.getLocVT() == MVT::i1) {
-        if (MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64)
+        if (MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
           CopyToBits = 64;
       }
 
@@ -243,7 +243,7 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
     MIB.addUse(PhysReg, RegState::Implicit);
 
     if (VA.getLocVT() == MVT::i1 &&
-        MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64) {
+        MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64()) {
       MIRBuilder.buildCopy(PhysReg, ValVReg);
       return;
     }

>From 3b323d98e74d89ada3cf9c1338ef9ef89a62e84d Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Wed, 31 Jan 2024 12:54:51 -0600
Subject: [PATCH 08/25] Changing a vector of 4 registers to a single register.

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 54797b29d8965..0fdb3c4e36c67 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3679,14 +3679,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   // reserve these registers.
   if (!Subtarget->enableFlatScratch()) {
     if (IsChainCallConv)
-      CCInfo.AllocateRegBlock(
-          ArrayRef<MCPhysReg>{AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50,
-                              AMDGPU::SGPR51},
-          4);
+      CCInfo.AllocateReg(AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51);
     else
-      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1,
-                                                  AMDGPU::SGPR2, AMDGPU::SGPR3},
-                              4);
+      CCInfo.AllocateReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3);
   }
 
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);

>From b4c0bb9e5e5d8fcbbe861682edd910db48266189 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 2 Feb 2024 16:07:43 -0600
Subject: [PATCH 09/25] Update some test files.

---
 .../AMDGPU/GlobalISel/irtranslator-call.ll    |   21 +-
 .../GlobalISel/irtranslator-function-args.ll  |  173 +-
 .../GlobalISel/irtranslator-invariant.ll      |    4 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll |   16 +-
 .../CodeGen/AMDGPU/GlobalISel/localizer.ll    |    2 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll              | 1923 ++++++-----------
 llvm/test/CodeGen/AMDGPU/call-args-inreg.ll   |  325 ++-
 .../CodeGen/AMDGPU/call-argument-types.ll     |  155 +-
 .../CodeGen/AMDGPU/combine_andor_with_cmps.ll |  474 ++--
 .../dagcombine-v1i8-extractvecelt-crash.ll    |   13 +-
 .../AMDGPU/divergence-driven-trunc-to-i1.ll   |   42 +-
 llvm/test/CodeGen/AMDGPU/extract-load-i1.ll   |    2 +
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll |  130 +-
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |  344 +--
 14 files changed, 1395 insertions(+), 2229 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index e546144ce3373..d0a17bc48c185 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -368,12 +368,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
   ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
   ; CHECK-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32)
   ; CHECK-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s1)
-  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[ANYEXT]](s64)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[C]](s1)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF]](p4)
   ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
   ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
   ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
@@ -426,12 +425,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
   ; CHECK-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32)
   ; CHECK-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[SEXT]](s64)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[LOAD]](s1)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF1]](p4)
   ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
   ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
   ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
@@ -485,12 +483,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
   ; CHECK-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32)
   ; CHECK-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[ZEXT]](s64)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[LOAD]](s1)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF1]](p4)
   ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
   ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
   ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index 2c8f22ed57ab2..d239b7271dd89 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -35,9 +35,9 @@ define void @void_func_empty_array([0 x i8] %arg0, i32 %arg1) #0 {
 define void @void_func_i1(i1 %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
+  ; CHECK-NEXT:   liveins: $sgpr16_sgpr17
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
@@ -49,9 +49,9 @@ define void @void_func_i1(i1 %arg0) #0 {
 define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1_zeroext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
+  ; CHECK-NEXT:   liveins: $sgpr16_sgpr17
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -68,9 +68,9 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
 define void @void_func_i1_signext(i1 signext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1_signext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
+  ; CHECK-NEXT:   liveins: $sgpr16_sgpr17
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -88,9 +88,9 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
   ; CHECK-LABEL: name: i1_arg_i1_use
   ; CHECK: bb.1.bb:
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
+  ; CHECK-NEXT:   liveins: $sgpr16_sgpr17
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
@@ -1988,7 +1988,7 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
   ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, addrspace 5)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32)
-  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s64)
   ; CHECK-NEXT:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
   ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s16) from %fixed-stack.2, align 4, addrspace 5)
@@ -2781,8 +2781,8 @@ define void @void_func_i1_inreg(i1 inreg %arg0) #0 {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $sgpr16
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr16
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
@@ -3229,6 +3229,9 @@ define void @void_func_v2p3_inreg(<2 x ptr addrspace(3)> inreg %arg0) #0 {
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<2 x p3>), [[DEF]](p1) :: (store (<2 x p3>) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
   store <2 x ptr addrspace(3)> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
 ; Check calling convention for i1 args
 define void @many_i1_args(
   i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
@@ -3237,71 +3240,71 @@ define void @many_i1_args(
   i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
 ; CHECK-LABEL: name: many_i1_args
 ; CHECK: bb.1 (%ir-block.0):
-; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
+; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
 ; CHECK-NEXT: {{  $}}
-; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
 ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
-; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s64) = COPY $sgpr2_sgpr3
+; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s64) = COPY $sgpr18_sgpr19
 ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s64)
-; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr20_sgpr21
 ; CHECK-NEXT:   [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
-; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
+; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr22_sgpr23
 ; CHECK-NEXT:   [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
-; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(s64) = COPY $sgpr8_sgpr9
+; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(s64) = COPY $sgpr24_sgpr25
 ; CHECK-NEXT:   [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s64)
-; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(s64) = COPY $sgpr10_sgpr11
+; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(s64) = COPY $sgpr26_sgpr27
 ; CHECK-NEXT:   [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s64)
-; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(s64) = COPY $sgpr12_sgpr13
+; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(s64) = COPY $sgpr28_sgpr29
 ; CHECK-NEXT:   [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s64)
-; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(s64) = COPY $sgpr14_sgpr15
-; CHECK-NEXT:   [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s64)
-; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
-; CHECK-NEXT:   [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s64)
-; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(s64) = COPY $sgpr18_sgpr19
-; CHECK-NEXT:   [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s64)
-; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(s64) = COPY $sgpr20_sgpr21
-; CHECK-NEXT:   [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s64)
-; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(s64) = COPY $sgpr22_sgpr23
-; CHECK-NEXT:   [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s64)
-; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s64) = COPY $sgpr24_sgpr25
-; CHECK-NEXT:   [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s64)
-; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s64) = COPY $sgpr26_sgpr27
-; CHECK-NEXT:   [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s64)
-; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s64) = COPY $sgpr28_sgpr29
-; CHECK-NEXT:   [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s64)
-; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr0
+; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr0
+; CHECK-NEXT:   [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
+; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr1
+; CHECK-NEXT:   [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
+; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr2
+; CHECK-NEXT:   [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
+; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr3
+; CHECK-NEXT:   [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
+; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr4
+; CHECK-NEXT:   [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
+; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr5
+; CHECK-NEXT:   [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
+; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr6
+; CHECK-NEXT:   [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
+; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr7
+; CHECK-NEXT:   [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
+; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr8
 ; CHECK-NEXT:   [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
-; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr1
+; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr9
 ; CHECK-NEXT:   [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
-; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr2
+; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr10
 ; CHECK-NEXT:   [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
-; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr3
+; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr11
 ; CHECK-NEXT:   [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
-; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr4
+; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr12
 ; CHECK-NEXT:   [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
-; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr5
+; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr13
 ; CHECK-NEXT:   [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
-; CHECK-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr6
+; CHECK-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr14
 ; CHECK-NEXT:   [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
-; CHECK-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr7
+; CHECK-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr15
 ; CHECK-NEXT:   [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
-; CHECK-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr8
+; CHECK-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr16
 ; CHECK-NEXT:   [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
-; CHECK-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr9
+; CHECK-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr17
 ; CHECK-NEXT:   [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
-; CHECK-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr10
+; CHECK-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr18
 ; CHECK-NEXT:   [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
-; CHECK-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr11
+; CHECK-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr19
 ; CHECK-NEXT:   [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
-; CHECK-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr12
+; CHECK-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr20
 ; CHECK-NEXT:   [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
-; CHECK-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr13
+; CHECK-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr21
 ; CHECK-NEXT:   [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
-; CHECK-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr14
+; CHECK-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr22
 ; CHECK-NEXT:   [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
-; CHECK-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr15
+; CHECK-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr23
 ; CHECK-NEXT:   [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
-; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr16
+; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr24
 ; CHECK-NEXT:   [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
 ;
 ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -3311,71 +3314,71 @@ define void @many_i1_args(
 ;
 ; GFX11-LABEL: name: many_i1_args
 ; GFX11: bb.1 (%ir-block.0):
-; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1
+; GFX11-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr16
 ; GFX11-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr17
 ; GFX11-NEXT:   [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
-; GFX11-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+; GFX11-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr18
 ; GFX11-NEXT:   [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
-; GFX11-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr3
+; GFX11-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr19
 ; GFX11-NEXT:   [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s32)
-; GFX11-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX11-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr20
 ; GFX11-NEXT:   [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s32)
-; GFX11-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr5
+; GFX11-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr21
 ; GFX11-NEXT:   [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s32)
-; GFX11-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX11-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr22
 ; GFX11-NEXT:   [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s32)
-; GFX11-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr7
+; GFX11-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr23
 ; GFX11-NEXT:   [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
-; GFX11-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr8
+; GFX11-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr24
 ; GFX11-NEXT:   [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
-; GFX11-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr9
+; GFX11-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr25
 ; GFX11-NEXT:   [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
-; GFX11-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr10
+; GFX11-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr26
 ; GFX11-NEXT:   [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
-; GFX11-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr11
+; GFX11-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr27
 ; GFX11-NEXT:   [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
-; GFX11-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr12
+; GFX11-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr28
 ; GFX11-NEXT:   [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
-; GFX11-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY $sgpr13
+; GFX11-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY $sgpr29
 ; GFX11-NEXT:   [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
-; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY $sgpr14
+; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX11-NEXT:   [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
-; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $sgpr15
+; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX11-NEXT:   [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
-; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $sgpr16
+; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr2
 ; GFX11-NEXT:   [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
-; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $sgpr17
+; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX11-NEXT:   [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
-; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $sgpr18
+; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr4
 ; GFX11-NEXT:   [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
-; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $sgpr19
+; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr5
 ; GFX11-NEXT:   [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
-; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $sgpr20
+; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr6
 ; GFX11-NEXT:   [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
-; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $sgpr21
+; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr7
 ; GFX11-NEXT:   [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
-; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $sgpr22
+; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr8
 ; GFX11-NEXT:   [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
-; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $sgpr23
+; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr9
 ; GFX11-NEXT:   [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
-; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $sgpr24
+; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr10
 ; GFX11-NEXT:   [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
-; GFX11-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $sgpr25
+; GFX11-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr11
 ; GFX11-NEXT:   [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
-; GFX11-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $sgpr26
+; GFX11-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr12
 ; GFX11-NEXT:   [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
-; GFX11-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $sgpr27
+; GFX11-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr13
 ; GFX11-NEXT:   [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
-; GFX11-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $sgpr28
+; GFX11-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr14
 ; GFX11-NEXT:   [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
-; GFX11-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $sgpr29
+; GFX11-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr15
 ; GFX11-NEXT:   [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
-; GFX11-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr16
 ; GFX11-NEXT:   [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
-; GFX11-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX11-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr17
 ; GFX11-NEXT:   [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
 ;
 ; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
index ac1eb4e2adda0..6360c5c2cbb2e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
@@ -22,9 +22,9 @@ define i32 @load_const_i32_gv() {
 define i32 @load_select_const_i32_gv(i1 %cond) {
   ; CHECK-LABEL: name: load_select_const_i32_gv
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv0
   ; CHECK-NEXT:   [[GV1:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 979590fd11688..44014f2546814 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -11,7 +11,7 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX7-LABEL: v_div_fmas_f32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s4, 1, s0
+; GFX7-NEXT:    s_and_b32 s4, 1, s4
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX7-NEXT:    s_nop 3
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
@@ -20,7 +20,7 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX8-LABEL: v_div_fmas_f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, 1, s0
+; GFX8-NEXT:    s_and_b32 s4, 1, s4
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX8-NEXT:    s_nop 3
 ; GFX8-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
@@ -29,7 +29,7 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX10_W32-LABEL: v_div_fmas_f32:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W32-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W32-NEXT:    s_and_b32 s4, 1, s4
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX10_W32-NEXT:    s_setpc_b64 s[30:31]
@@ -37,7 +37,7 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX10_W64-LABEL: v_div_fmas_f32:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W64-NEXT:    s_and_b32 s4, 1, s4
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX10_W64-NEXT:    s_setpc_b64 s[30:31]
@@ -65,7 +65,7 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX7-LABEL: v_div_fmas_f64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s4, 1, s0
+; GFX7-NEXT:    s_and_b32 s4, 1, s4
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX7-NEXT:    s_nop 3
 ; GFX7-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
@@ -74,7 +74,7 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX8-LABEL: v_div_fmas_f64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, 1, s0
+; GFX8-NEXT:    s_and_b32 s4, 1, s4
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX8-NEXT:    s_nop 3
 ; GFX8-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
@@ -83,7 +83,7 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX10_W32-LABEL: v_div_fmas_f64:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W32-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W32-NEXT:    s_and_b32 s4, 1, s4
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX10_W32-NEXT:    s_setpc_b64 s[30:31]
@@ -91,7 +91,7 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX10_W64-LABEL: v_div_fmas_f64:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W64-NEXT:    s_and_b32 s4, 1, s4
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX10_W64-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX10_W64-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 1cff9ba4d2340..4d04d6b7570c2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -168,7 +168,7 @@ define void @localize_internal_globals(i1 %cond) {
 ; GFX9-LABEL: localize_internal_globals:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s4, 1, s0
+; GFX9-NEXT:    s_and_b32 s4, 1, s4
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index a86a3f6f279d7..a8a6f1954edd1 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -26343,37 +26343,37 @@ define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) {
 ; GCN-LABEL: v_fcmp_false_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_mov_b64 s[0:1], 0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_false_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_false_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_false_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_false_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_mov_b32 s0, 0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_false_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp false bfloat %a, %b
   ret i1 %op
@@ -26387,8 +26387,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_eq_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_oeq_bf16:
@@ -26398,8 +26397,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_oeq_bf16:
@@ -26407,8 +26405,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_oeq_bf16:
@@ -26416,8 +26413,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_oeq_bf16:
@@ -26425,8 +26421,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_oeq_bf16:
@@ -26435,8 +26430,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp oeq bfloat %a, %b
   ret i1 %op
@@ -26450,8 +26444,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_gt_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ogt_bf16:
@@ -26461,8 +26454,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_gt_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ogt_bf16:
@@ -26470,8 +26462,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ogt_bf16:
@@ -26479,8 +26470,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ogt_bf16:
@@ -26488,8 +26478,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ogt_bf16:
@@ -26498,8 +26487,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp ogt bfloat %a, %b
   ret i1 %op
@@ -26513,8 +26501,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_oge_bf16:
@@ -26524,8 +26511,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_ge_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_oge_bf16:
@@ -26533,8 +26519,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_oge_bf16:
@@ -26542,8 +26527,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_oge_bf16:
@@ -26551,8 +26535,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_oge_bf16:
@@ -26561,8 +26544,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp oge bfloat %a, %b
   ret i1 %op
@@ -26576,8 +26558,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_lt_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_olt_bf16:
@@ -26587,8 +26568,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_lt_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_olt_bf16:
@@ -26596,8 +26576,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_olt_bf16:
@@ -26605,8 +26584,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_olt_bf16:
@@ -26614,8 +26592,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_olt_bf16:
@@ -26624,8 +26601,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp olt bfloat %a, %b
   ret i1 %op
@@ -26639,8 +26615,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_le_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ole_bf16:
@@ -26650,8 +26625,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_le_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ole_bf16:
@@ -26659,8 +26633,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_le_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ole_bf16:
@@ -26668,8 +26641,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_le_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ole_bf16:
@@ -26677,8 +26649,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_le_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ole_bf16:
@@ -26687,8 +26658,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_le_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp ole bfloat %a, %b
   ret i1 %op
@@ -26702,8 +26672,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_lg_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_one_bf16:
@@ -26713,8 +26682,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_lg_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_one_bf16:
@@ -26722,8 +26690,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_lg_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_one_bf16:
@@ -26731,8 +26698,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_lg_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_one_bf16:
@@ -26740,8 +26706,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_lg_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_lg_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_one_bf16:
@@ -26750,8 +26715,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_lg_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_lg_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp one bfloat %a, %b
   ret i1 %op
@@ -26765,8 +26729,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_uno_bf16:
@@ -26776,8 +26739,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_uno_bf16:
@@ -26785,8 +26747,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_uno_bf16:
@@ -26794,8 +26755,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_uno_bf16:
@@ -26803,8 +26763,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_uno_bf16:
@@ -26813,8 +26772,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp uno bfloat %a, %b
   ret i1 %op
@@ -26828,8 +26786,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_nlg_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ueq_bf16:
@@ -26839,8 +26796,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_nlg_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ueq_bf16:
@@ -26848,8 +26804,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_nlg_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ueq_bf16:
@@ -26857,8 +26812,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_nlg_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ueq_bf16:
@@ -26866,8 +26820,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nlg_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ueq_bf16:
@@ -26876,8 +26829,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nlg_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp ueq bfloat %a, %b
   ret i1 %op
@@ -26891,8 +26843,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_nle_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ugt_bf16:
@@ -26902,8 +26853,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_nle_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ugt_bf16:
@@ -26911,8 +26861,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_nle_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ugt_bf16:
@@ -26920,8 +26869,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_nle_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ugt_bf16:
@@ -26929,8 +26877,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_nle_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nle_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ugt_bf16:
@@ -26939,8 +26886,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_nle_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nle_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp ugt bfloat %a, %b
   ret i1 %op
@@ -26954,8 +26900,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_nlt_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_uge_bf16:
@@ -26965,8 +26910,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_nlt_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_uge_bf16:
@@ -26974,8 +26918,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_nlt_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_uge_bf16:
@@ -26983,8 +26926,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_nlt_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_uge_bf16:
@@ -26992,8 +26934,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nlt_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_uge_bf16:
@@ -27002,8 +26943,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nlt_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp uge bfloat %a, %b
   ret i1 %op
@@ -27017,8 +26957,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_nge_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ult_bf16:
@@ -27028,8 +26967,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_nge_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ult_bf16:
@@ -27037,8 +26975,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_nge_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ult_bf16:
@@ -27046,8 +26983,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_nge_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ult_bf16:
@@ -27055,8 +26991,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nge_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ult_bf16:
@@ -27065,8 +27000,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nge_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp ult bfloat %a, %b
   ret i1 %op
@@ -27080,8 +27014,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_ngt_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ule_bf16:
@@ -27091,8 +27024,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_ngt_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ule_bf16:
@@ -27100,8 +27032,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_ngt_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ule_bf16:
@@ -27109,8 +27040,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_ngt_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ule_bf16:
@@ -27118,8 +27048,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_ngt_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ule_bf16:
@@ -27128,8 +27057,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ngt_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp ule bfloat %a, %b
   ret i1 %op
@@ -27143,8 +27071,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_neq_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_une_bf16:
@@ -27154,8 +27081,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_neq_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_une_bf16:
@@ -27163,8 +27089,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_neq_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_une_bf16:
@@ -27172,8 +27097,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_neq_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_une_bf16:
@@ -27181,8 +27105,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_neq_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_neq_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_une_bf16:
@@ -27191,8 +27114,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_neq_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_neq_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp une bfloat %a, %b
   ret i1 %op
@@ -27202,37 +27124,37 @@ define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) {
 ; GCN-LABEL: v_fcmp_true_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, 1
+; GCN-NEXT:    s_mov_b64 s[0:1], -1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_true_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, 1
+; GFX7-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_true_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, 1
+; GFX8-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_true_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_true_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, 1
+; GFX10-NEXT:    s_mov_b32 s0, -1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_true_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, 1
+; GFX11-NEXT:    s_mov_b32 s0, -1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp true bfloat %a, %b
   ret i1 %op
@@ -33554,56 +33476,39 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GCN-LABEL: v_select_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_select_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_select_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, bfloat %a, bfloat %b
   ret bfloat %op
@@ -33613,60 +33518,46 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GCN-LABEL: v_select_fneg_lhs_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, -1.0, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_select_fneg_lhs_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_mul_f32_e32 v1, -1.0, v1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_select_fneg_lhs_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_fneg_lhs_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_fneg_lhs_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_fneg_lhs_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg bfloat %a
   %op = select i1 %cond, bfloat %neg.a, bfloat %b
@@ -33677,60 +33568,46 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GCN-LABEL: v_select_fneg_rhs_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mul_f32_e32 v2, -1.0, v2
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_select_fneg_rhs_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v2, -1.0, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_select_fneg_rhs_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_fneg_rhs_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_fneg_rhs_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_fneg_rhs_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.b = fneg bfloat %b
   %op = select i1 %cond, bfloat %a, bfloat %neg.b
@@ -33741,89 +33618,69 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
 ; GCN-LABEL: v_select_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_select_v2bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_select_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[4:5]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, v2, s4
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v3, v2, s0
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
@@ -34205,22 +34062,14 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
 ; GCN-LABEL: v_select_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT:    v_alignbit_b32 v2, v5, v4, 16
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v6, v3, vcc
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT:    v_alignbit_b32 v1, v4, v3, 16
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v5, v2, s[4:5]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
@@ -34229,22 +34078,14 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
 ; GFX7-LABEL: v_select_v3bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v5
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_alignbit_b32 v2, v2, v4, 16
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v6, v3, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_alignbit_b32 v1, v1, v3, 16
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v5, v2, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
@@ -34253,37 +34094,29 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
 ; GFX8-LABEL: v_select_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v3bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
   ret <3 x bfloat> %op
@@ -34293,26 +34126,16 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
 ; GCN-LABEL: v_select_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT:    v_alignbit_b32 v2, v6, v5, 16
-; GCN-NEXT:    v_alignbit_b32 v3, v4, v3, 16
-; GCN-NEXT:    v_alignbit_b32 v4, v8, v7, 16
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GCN-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GCN-NEXT:    v_alignbit_b32 v3, v7, v6, 16
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -34322,26 +34145,16 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
 ; GFX7-LABEL: v_select_v4bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v6
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT:    v_alignbit_b32 v3, v4, v3, 16
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v8
-; GFX7-NEXT:    v_alignbit_b32 v2, v2, v5, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v7
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_alignbit_b32 v4, v4, v5, 16
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v1, v1, v4, 16
+; GFX7-NEXT:    v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -34351,37 +34164,29 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
 ; GFX8-LABEL: v_select_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
   ret <4 x bfloat> %op
@@ -34391,35 +34196,21 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
 ; GCN-LABEL: v_select_v6bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT:    v_alignbit_b32 v2, v8, v7, 16
-; GCN-NEXT:    v_alignbit_b32 v3, v4, v3, 16
-; GCN-NEXT:    v_alignbit_b32 v4, v10, v9, 16
-; GCN-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; GCN-NEXT:    v_alignbit_b32 v6, v12, v11, 16
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT:    v_alignbit_b32 v1, v7, v6, 16
+; GCN-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GCN-NEXT:    v_alignbit_b32 v3, v9, v8, 16
+; GCN-NEXT:    v_alignbit_b32 v4, v5, v4, 16
+; GCN-NEXT:    v_alignbit_b32 v5, v11, v10, 16
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -34431,35 +34222,21 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
 ; GFX7-LABEL: v_select_v6bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT:    v_alignbit_b32 v3, v4, v3, 16
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v10
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT:    v_alignbit_b32 v2, v2, v7, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v9
-; GFX7-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v12
-; GFX7-NEXT:    v_alignbit_b32 v4, v4, v7, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v11
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_alignbit_b32 v6, v6, v7, 16
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
+; GFX7-NEXT:    v_alignbit_b32 v4, v5, v4, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v11
+; GFX7-NEXT:    v_alignbit_b32 v1, v1, v6, 16
+; GFX7-NEXT:    v_alignbit_b32 v3, v3, v8, 16
+; GFX7-NEXT:    v_alignbit_b32 v5, v5, v10, 16
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -34471,41 +34248,33 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
 ; GFX8-LABEL: v_select_v6bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v6bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v6bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v6bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v1, v5, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b
   ret <6 x bfloat> %op
@@ -34515,44 +34284,26 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
 ; GCN-LABEL: v_select_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT:    v_alignbit_b32 v2, v10, v9, 16
-; GCN-NEXT:    v_alignbit_b32 v3, v4, v3, 16
-; GCN-NEXT:    v_alignbit_b32 v4, v12, v11, 16
-; GCN-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; GCN-NEXT:    v_alignbit_b32 v6, v14, v13, 16
-; GCN-NEXT:    v_alignbit_b32 v7, v8, v7, 16
-; GCN-NEXT:    v_alignbit_b32 v8, v16, v15, 16
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT:    v_alignbit_b32 v1, v9, v8, 16
+; GCN-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GCN-NEXT:    v_alignbit_b32 v3, v11, v10, 16
+; GCN-NEXT:    v_alignbit_b32 v4, v5, v4, 16
+; GCN-NEXT:    v_alignbit_b32 v5, v13, v12, 16
+; GCN-NEXT:    v_alignbit_b32 v6, v7, v6, 16
+; GCN-NEXT:    v_alignbit_b32 v7, v15, v14, 16
+; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -34566,44 +34317,26 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
 ; GFX7-LABEL: v_select_v8bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v10
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT:    v_alignbit_b32 v3, v4, v3, 16
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v12
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT:    v_alignbit_b32 v2, v2, v9, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v11
-; GFX7-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v14
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT:    v_alignbit_b32 v4, v4, v9, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v13
-; GFX7-NEXT:    v_alignbit_b32 v7, v8, v7, 16
-; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v16
-; GFX7-NEXT:    v_alignbit_b32 v6, v6, v9, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v15
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_alignbit_b32 v8, v8, v9, 16
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v9
+; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v11
+; GFX7-NEXT:    v_alignbit_b32 v4, v5, v4, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v13
+; GFX7-NEXT:    v_alignbit_b32 v6, v7, v6, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
+; GFX7-NEXT:    v_alignbit_b32 v1, v1, v8, 16
+; GFX7-NEXT:    v_alignbit_b32 v3, v3, v10, 16
+; GFX7-NEXT:    v_alignbit_b32 v5, v5, v12, 16
+; GFX7-NEXT:    v_alignbit_b32 v7, v7, v14, 16
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -34617,44 +34350,37 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
 ; GFX8-LABEL: v_select_v8bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v8bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v1, v6, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v7, v3 :: v_dual_cndmask_b32 v3, v8, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b
   ret <8 x bfloat> %op
@@ -34664,81 +34390,44 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GCN-LABEL: v_select_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    v_alignbit_b32 v0, v0, v1, 16
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v18
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v17
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_alignbit_b32 v1, v1, v2, 16
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v4
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_alignbit_b32 v2, v2, v3, 16
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v20
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v19
+; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
+; GCN-NEXT:    v_alignbit_b32 v1, v1, v16, 16
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT:    v_alignbit_b32 v4, v4, v5, 16
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v22
-; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v21
+; GCN-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
+; GCN-NEXT:    v_alignbit_b32 v3, v3, v18, 16
 ; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_alignbit_b32 v5, v5, v6, 16
-; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v8
-; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT:    v_alignbit_b32 v6, v6, v7, 16
-; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v24
-; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v23
+; GCN-NEXT:    v_alignbit_b32 v4, v5, v4, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v21
+; GCN-NEXT:    v_alignbit_b32 v5, v5, v20, 16
 ; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT:    v_alignbit_b32 v7, v7, v8, 16
-; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v10
-; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT:    v_alignbit_b32 v8, v8, v9, 16
-; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v26
-; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v25
-; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v28
-; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v27
-; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v30
-; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v29
-; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT:    v_alignbit_b32 v6, v7, v6, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
+; GCN-NEXT:    v_alignbit_b32 v7, v7, v22, 16
 ; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT:    v_alignbit_b32 v9, v9, v10, 16
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:4
-; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT:    v_alignbit_b32 v11, v12, v11, 16
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s32
-; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
-; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT:    v_alignbit_b32 v17, v17, v18, 16
-; GCN-NEXT:    v_alignbit_b32 v13, v14, v13, 16
-; GCN-NEXT:    v_alignbit_b32 v14, v19, v20, 16
-; GCN-NEXT:    v_alignbit_b32 v15, v16, v15, 16
-; GCN-NEXT:    v_cndmask_b32_e32 v13, v14, v13, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v11, v17, v11, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v10
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v12
+; GCN-NEXT:    v_alignbit_b32 v8, v9, v8, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v25
+; GCN-NEXT:    v_alignbit_b32 v9, v9, v24, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v27
+; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v29
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT:    v_alignbit_b32 v10, v11, v10, 16
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32
+; GCN-NEXT:    v_alignbit_b32 v11, v16, v26, 16
+; GCN-NEXT:    v_alignbit_b32 v12, v13, v12, 16
+; GCN-NEXT:    v_alignbit_b32 v13, v17, v28, 16
+; GCN-NEXT:    v_alignbit_b32 v14, v15, v14, 16
+; GCN-NEXT:    v_cndmask_b32_e64 v13, v13, v12, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v11, v11, v10, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v8, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -34753,9 +34442,10 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
 ; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT:    v_alignbit_b32 v14, v14, v16, 16
-; GCN-NEXT:    v_cndmask_b32_e32 v15, v14, v15, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
+; GCN-NEXT:    v_alignbit_b32 v15, v15, v30, 16
+; GCN-NEXT:    v_cndmask_b32_e64 v15, v15, v14, s[4:5]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
 ; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -34763,77 +34453,44 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GFX7-LABEL: v_select_v16bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v18
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT:    v_alignbit_b32 v3, v4, v3, 16
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v20
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT:    v_alignbit_b32 v2, v2, v17, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v19
-; GFX7-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v22
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT:    v_alignbit_b32 v4, v4, v17, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v21
-; GFX7-NEXT:    v_alignbit_b32 v7, v8, v7, 16
-; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v24
-; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT:    v_alignbit_b32 v6, v6, v17, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v23
-; GFX7-NEXT:    v_alignbit_b32 v9, v10, v9, 16
-; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v26
-; GFX7-NEXT:    v_alignbit_b32 v8, v8, v17, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v25
-; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT:    v_alignbit_b32 v10, v10, v17, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v28
-; GFX7-NEXT:    v_alignbit_b32 v11, v12, v11, 16
-; GFX7-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
-; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v27
-; GFX7-NEXT:    v_alignbit_b32 v17, v17, v18, 16
-; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s32
-; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT:    v_alignbit_b32 v13, v14, v13, 16
-; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v30
-; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v29
-; GFX7-NEXT:    v_alignbit_b32 v15, v16, v15, 16
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_alignbit_b32 v14, v14, v19, 16
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v13, v14, v13, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v11, v17, v11, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT:    v_alignbit_b32 v10, v11, v10, 16
+; GFX7-NEXT:    buffer_load_dword v11, off, s[0:3], s32
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
+; GFX7-NEXT:    v_alignbit_b32 v1, v1, v16, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v27
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 16, v29
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT:    v_alignbit_b32 v16, v16, v26, 16
+; GFX7-NEXT:    v_alignbit_b32 v12, v13, v12, 16
+; GFX7-NEXT:    v_alignbit_b32 v13, v17, v28, 16
+; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
+; GFX7-NEXT:    v_alignbit_b32 v4, v5, v4, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v21
+; GFX7-NEXT:    v_alignbit_b32 v6, v7, v6, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
+; GFX7-NEXT:    v_alignbit_b32 v8, v9, v8, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v25
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v13, v12, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v12, v16, v10, s[4:5]
+; GFX7-NEXT:    v_alignbit_b32 v3, v3, v18, 16
+; GFX7-NEXT:    v_alignbit_b32 v5, v5, v20, 16
+; GFX7-NEXT:    v_alignbit_b32 v7, v7, v22, 16
+; GFX7-NEXT:    v_alignbit_b32 v9, v9, v24, 16
+; GFX7-NEXT:    v_alignbit_b32 v14, v15, v14, 16
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v9, v8, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -34844,15 +34501,12 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
 ; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v18
-; GFX7-NEXT:    v_alignbit_b32 v12, v12, v16, 16
-; GFX7-NEXT:    v_cndmask_b32_e32 v15, v12, v15, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v11
+; GFX7-NEXT:    v_alignbit_b32 v10, v10, v30, 16
+; GFX7-NEXT:    v_cndmask_b32_e64 v15, v10, v14, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v12
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
 ; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
@@ -34862,58 +34516,53 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GFX8-LABEL: v_select_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v13, v5, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v14, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v15, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v16, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v12, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v13, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v14, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v15, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v16, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v16bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v10, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v12, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v13, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v14, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v16, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v9, v1 :: v_dual_cndmask_b32 v1, v10, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v11, v3 :: v_dual_cndmask_b32 v3, v12, v4
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v13, v5 :: v_dual_cndmask_b32 v5, v14, v6
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v15, v7 :: v_dual_cndmask_b32 v7, v16, v8
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b
   ret <16 x bfloat> %op
@@ -34923,220 +34572,152 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GCN-LABEL: v_select_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    v_alignbit_b32 v0, v0, v1, 16
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v4
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v3
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
 ; GCN-NEXT:    v_alignbit_b32 v1, v1, v2, 16
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v6
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_alignbit_b32 v2, v2, v3, 16
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v8
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v7
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v10
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v9
-; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT:    v_alignbit_b32 v4, v4, v5, 16
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v12
-; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v11
-; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_alignbit_b32 v5, v5, v6, 16
-; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v14
-; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v13
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT:    v_alignbit_b32 v6, v6, v7, 16
-; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v16
-; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v15
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT:    v_alignbit_b32 v7, v7, v8, 16
-; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v18
-; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v17
-; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT:    v_alignbit_b32 v8, v8, v9, 16
-; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v20
-; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v19
-; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT:    v_alignbit_b32 v9, v9, v10, 16
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:12
-; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v22
-; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v21
-; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT:    v_alignbit_b32 v10, v10, v11, 16
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
-; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v24
-; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v23
-; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT:    v_alignbit_b32 v11, v11, v12, 16
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:20
-; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v26
-; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v25
-; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT:    v_alignbit_b32 v12, v12, v13, 16
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:16
-; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v28
-; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v27
-; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT:    v_alignbit_b32 v13, v13, v14, 16
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:28
-; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v30
-; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v29
-; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT:    v_alignbit_b32 v14, v14, v20, 16
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:24
-; GCN-NEXT:    s_waitcnt vmcnt(5)
-; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GCN-NEXT:    v_alignbit_b32 v2, v2, v4, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
+; GCN-NEXT:    v_alignbit_b32 v3, v3, v6, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v9
+; GCN-NEXT:    v_alignbit_b32 v4, v4, v8, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v11
+; GCN-NEXT:    v_alignbit_b32 v5, v5, v10, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v13
+; GCN-NEXT:    v_alignbit_b32 v6, v6, v12, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
+; GCN-NEXT:    v_alignbit_b32 v7, v7, v14, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v17
+; GCN-NEXT:    v_alignbit_b32 v8, v8, v16, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
+; GCN-NEXT:    v_alignbit_b32 v9, v9, v18, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v21
+; GCN-NEXT:    v_alignbit_b32 v10, v10, v20, 16
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:8
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v23
+; GCN-NEXT:    v_alignbit_b32 v11, v11, v22, 16
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v25
+; GCN-NEXT:    v_alignbit_b32 v12, v12, v24, 16
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:16
+; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v27
+; GCN-NEXT:    v_alignbit_b32 v13, v13, v26, 16
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v29
+; GCN-NEXT:    v_alignbit_b32 v14, v14, v28, 16
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:24
 ; GCN-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
 ; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT:    v_alignbit_b32 v15, v15, v16, 16
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:36
-; GCN-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v17
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v18
-; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT:    v_alignbit_b32 v16, v16, v17, 16
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:32
+; GCN-NEXT:    v_alignbit_b32 v15, v15, v16, 16
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v19
+; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v17
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v20
-; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
-; GCN-NEXT:    v_alignbit_b32 v17, v17, v19, 16
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:44
+; GCN-NEXT:    v_alignbit_b32 v16, v16, v18, 16
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:32
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v21
+; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:40
-; GCN-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
-; GCN-NEXT:    v_alignbit_b32 v18, v20, v18, 16
+; GCN-NEXT:    v_alignbit_b32 v17, v17, v20, 16
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:28
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:40
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:36
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_alignbit_b32 v18, v18, v19, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v21
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:52
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:48
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; GCN-NEXT:    v_alignbit_b32 v19, v19, v20, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:48
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v21
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v22
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
-; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:56
+; GCN-NEXT:    v_alignbit_b32 v19, v19, v21, 16
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:44
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:52
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_alignbit_b32 v20, v20, v21, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v22
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v23
-; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:68
-; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:64
-; GCN-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
-; GCN-NEXT:    v_alignbit_b32 v21, v21, v22, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v21, 16, v22
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:64
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v23
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v24
-; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:76
-; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:72
+; GCN-NEXT:    v_alignbit_b32 v21, v21, v23, 16
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:60
+; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:72
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:68
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_alignbit_b32 v22, v22, v23, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v24
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v25
-; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:84
-; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:80
-; GCN-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
-; GCN-NEXT:    v_alignbit_b32 v23, v23, v24, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v23, 16, v24
+; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:80
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v25
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v26
-; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:92
-; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:88
+; GCN-NEXT:    v_alignbit_b32 v23, v23, v25, 16
+; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:76
+; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:88
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:84
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_alignbit_b32 v24, v24, v25, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v26
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v27
-; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:100
-; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:96
-; GCN-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
-; GCN-NEXT:    v_alignbit_b32 v25, v25, v26, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v25, 16, v26
+; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:96
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v27
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v28
-; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:108
-; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:104
+; GCN-NEXT:    v_alignbit_b32 v25, v25, v27, 16
+; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:92
+; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:104
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:100
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_alignbit_b32 v26, v26, v27, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v28
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v29
-; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:116
-; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:112
-; GCN-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
-; GCN-NEXT:    v_alignbit_b32 v27, v27, v28, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v27, 16, v28
+; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:112
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v29
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v30
-; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:124
+; GCN-NEXT:    v_alignbit_b32 v27, v27, v29, 16
+; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:108
 ; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_alignbit_b32 v28, v28, v29, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v30
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v31
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32
-; GCN-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
-; GCN-NEXT:    v_alignbit_b32 v29, v29, v30, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v29, 16, v31
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v31
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:132
-; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GCN-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
-; GCN-NEXT:    v_alignbit_b32 v30, v30, v31, 16
+; GCN-NEXT:    v_alignbit_b32 v29, v29, v32, 16
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT:    v_alignbit_b32 v30, v31, v30, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v32
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
-; GCN-NEXT:    v_alignbit_b32 v31, v31, v32, 16
-; GCN-NEXT:    v_cndmask_b32_e32 v31, v31, v30, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v29, v29, v14, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v28, v28, v13, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v27, v27, v12, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v26, v26, v11, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v25, v25, v10, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v24, v24, v9, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v23, v23, v8, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v22, v22, v7, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v13, v21, v6, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v11, v20, v5, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v9, v19, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v7, v18, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v17, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v16, v1, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v15, v0, vcc
+; GCN-NEXT:    v_alignbit_b32 v31, v31, v33, 16
+; GCN-NEXT:    v_cndmask_b32_e64 v31, v31, v30, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v29, v29, v14, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v28, v28, v13, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v27, v27, v12, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v26, v26, v11, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v25, v25, v10, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v24, v24, v9, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v23, v23, v8, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v22, v22, v7, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v13, v21, v6, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v11, v20, v5, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v9, v19, v4, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v7, v18, v3, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v17, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v16, v1, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v15, v0, s[4:5]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -35174,244 +34755,174 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX7-LABEL: v_select_v32bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_alignbit_b32 v2, v2, v3, 16
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v6
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v5
-; GFX7-NEXT:    v_alignbit_b32 v3, v3, v4, 16
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v7
-; GFX7-NEXT:    v_alignbit_b32 v4, v4, v5, 16
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v10
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v9
-; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT:    v_alignbit_b32 v5, v5, v6, 16
-; GFX7-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:12
-; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
-; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:24
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GFX7-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
-; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT:    v_alignbit_b32 v17, v18, v17, 16
-; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:76
-; GFX7-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:8
-; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
-; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT:    v_alignbit_b32 v13, v14, v13, 16
-; GFX7-NEXT:    v_alignbit_b32 v27, v28, v27, 16
-; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
-; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT:    v_alignbit_b32 v11, v12, v11, 16
-; GFX7-NEXT:    v_alignbit_b32 v23, v24, v23, 16
-; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT:    v_alignbit_b32 v15, v16, v15, 16
-; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
-; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT:    v_alignbit_b32 v19, v20, v19, 16
-; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
-; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT:    v_alignbit_b32 v21, v22, v21, 16
-; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
-; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT:    v_alignbit_b32 v25, v26, v25, 16
-; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
-; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT:    v_alignbit_b32 v29, v30, v29, 16
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
-; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:60
-; GFX7-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:116
-; GFX7-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:52
-; GFX7-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:100
-; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:68
-; GFX7-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:84
-; GFX7-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:92
-; GFX7-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:108
-; GFX7-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:124
-; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v2, v2, v4, 16
+; GFX7-NEXT:    v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v11
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v13
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
+; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:12
+; GFX7-NEXT:    v_alignbit_b32 v5, v5, v10, 16
+; GFX7-NEXT:    v_alignbit_b32 v6, v6, v12, 16
+; GFX7-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
+; GFX7-NEXT:    v_alignbit_b32 v7, v7, v14, 16
+; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    v_alignbit_b32 v4, v4, v8, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v17
+; GFX7-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT:    v_alignbit_b32 v8, v8, v16, 16
+; GFX7-NEXT:    v_alignbit_b32 v24, v25, v24, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT:    v_alignbit_b32 v20, v21, v20, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT:    v_alignbit_b32 v28, v29, v28, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT:    v_alignbit_b32 v18, v19, v18, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT:    v_alignbit_b32 v22, v23, v22, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT:    v_alignbit_b32 v26, v27, v26, 16
+; GFX7-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; GFX7-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:104
+; GFX7-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:44
+; GFX7-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:88
+; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:60
+; GFX7-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:120
+; GFX7-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:80
+; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:96
+; GFX7-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:112
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
 ; GFX7-NEXT:    s_waitcnt vmcnt(14)
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT:    s_waitcnt vmcnt(13)
-; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT:    s_waitcnt vmcnt(12)
-; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT:    v_alignbit_b32 v6, v6, v7, 16
-; GFX7-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:20
-; GFX7-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    s_waitcnt vmcnt(12)
-; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
 ; GFX7-NEXT:    s_waitcnt vmcnt(11)
-; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT:    v_alignbit_b32 v9, v9, v10, 16
+; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:16
 ; GFX7-NEXT:    s_waitcnt vmcnt(9)
-; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
 ; GFX7-NEXT:    s_waitcnt vmcnt(7)
-; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX7-NEXT:    s_waitcnt vmcnt(6)
-; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX7-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
 ; GFX7-NEXT:    s_waitcnt vmcnt(5)
-; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
-; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX7-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
 ; GFX7-NEXT:    s_waitcnt vmcnt(4)
-; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
-; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
+; GFX7-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
 ; GFX7-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX7-NEXT:    v_alignbit_b32 v7, v7, v8, 16
-; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT:    v_alignbit_b32 v10, v10, v11, 16
+; GFX7-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:24
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT:    v_alignbit_b32 v8, v8, v9, 16
-; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT:    v_alignbit_b32 v11, v11, v12, 16
+; GFX7-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:32
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX7-NEXT:    v_alignbit_b32 v9, v9, v10, 16
-; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44
-; GFX7-NEXT:    v_cndmask_b32_e32 v9, v9, v4, vcc
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT:    v_alignbit_b32 v12, v12, v13, 16
+; GFX7-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX7-NEXT:    v_alignbit_b32 v10, v10, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GFX7-NEXT:    v_cndmask_b32_e32 v10, v10, v5, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v8, v3, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT:    v_alignbit_b32 v13, v13, v14, 16
+; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:48
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v13, v4, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT:    v_alignbit_b32 v14, v14, v15, 16
+; GFX7-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:56
+; GFX7-NEXT:    v_cndmask_b32_e64 v14, v14, v5, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v11, v2, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT:    v_alignbit_b32 v12, v12, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GFX7-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT:    v_alignbit_b32 v14, v14, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GFX7-NEXT:    v_cndmask_b32_e32 v13, v14, v13, vcc
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT:    v_alignbit_b32 v15, v15, v16, 16
+; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:64
+; GFX7-NEXT:    v_cndmask_b32_e64 v15, v15, v6, s[4:5]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT:    v_alignbit_b32 v16, v16, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GFX7-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT:    v_alignbit_b32 v16, v16, v17, 16
+; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:72
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, v16, v7, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v12, v3, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v10, v1, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v9, v0, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v16
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT:    v_alignbit_b32 v18, v18, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GFX7-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT:    v_alignbit_b32 v17, v17, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
+; GFX7-NEXT:    v_cndmask_b32_e64 v17, v17, v8, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v15
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v16
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
 ; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT:    v_alignbit_b32 v20, v20, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GFX7-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
+; GFX7-NEXT:    v_alignbit_b32 v19, v19, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX7-NEXT:    v_cndmask_b32_e64 v19, v19, v18, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
 ; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT:    v_alignbit_b32 v22, v22, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GFX7-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
+; GFX7-NEXT:    v_alignbit_b32 v21, v21, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
+; GFX7-NEXT:    v_cndmask_b32_e64 v21, v21, v20, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
 ; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT:    v_alignbit_b32 v24, v24, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GFX7-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
+; GFX7-NEXT:    v_alignbit_b32 v23, v23, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GFX7-NEXT:    v_cndmask_b32_e64 v23, v23, v22, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
 ; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT:    v_alignbit_b32 v26, v26, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GFX7-NEXT:    v_cndmask_b32_e32 v25, v26, v25, vcc
+; GFX7-NEXT:    v_alignbit_b32 v25, v25, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
+; GFX7-NEXT:    v_cndmask_b32_e64 v25, v25, v24, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
 ; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT:    v_alignbit_b32 v28, v28, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GFX7-NEXT:    v_cndmask_b32_e32 v27, v28, v27, vcc
+; GFX7-NEXT:    v_alignbit_b32 v27, v27, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX7-NEXT:    v_cndmask_b32_e64 v27, v27, v26, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
 ; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT:    v_alignbit_b32 v30, v30, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX7-NEXT:    v_cndmask_b32_e32 v29, v30, v29, vcc
+; GFX7-NEXT:    v_alignbit_b32 v29, v29, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT:    v_cndmask_b32_e64 v29, v29, v28, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
 ; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
-; GFX7-NEXT:    v_alignbit_b32 v31, v31, v32, 16
-; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:132
+; GFX7-NEXT:    v_alignbit_b32 v30, v31, v30, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
-; GFX7-NEXT:    v_alignbit_b32 v32, v32, v33, 16
-; GFX7-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT:    v_alignbit_b32 v31, v31, v32, 16
+; GFX7-NEXT:    v_cndmask_b32_e64 v31, v31, v30, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
 ; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -35419,103 +34930,93 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX8-LABEL: v_select_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v18, v2, vcc
-; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32
-; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:4
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v19, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v20, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v21, v5, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v22, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v23, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v25, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v26, v10, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v27, v11, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v28, v12, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v29, v13, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v30, v14, vcc
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v15, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[4:5]
+; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v24, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v25, v9, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v26, v10, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v27, v11, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v28, v12, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v29, v13, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v30, v14, s[4:5]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v18, v16, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v16, v15, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v17, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v18, v2, vcc
-; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v19, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v20, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v21, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v22, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v23, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v24, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v25, v9, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v26, v10, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v27, v11, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v28, v12, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v29, v13, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v30, v14, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v15, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[4:5]
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v24, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v25, v9, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v26, v10, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v27, v11, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v28, v12, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v29, v13, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v30, v14, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v18, v16, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v16, v15, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v17, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v18, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v19, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v20, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v21, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v22, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v23, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v24, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v25, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v26, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v27, v11, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v28, v12, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v29, v13, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v30, v14, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v31, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v24, v8, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v25, v9, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v26, v10, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v27, v11, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v28, v12, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v29, v13, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v30, v14, s4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v32, v16, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v31, v15, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v17, v1 :: v_dual_cndmask_b32 v1, v18, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v19, v3 :: v_dual_cndmask_b32 v3, v20, v4
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v21, v5 :: v_dual_cndmask_b32 v5, v22, v6
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v23, v7 :: v_dual_cndmask_b32 v7, v24, v8
-; GFX11-NEXT:    v_dual_cndmask_b32 v8, v25, v9 :: v_dual_cndmask_b32 v9, v26, v10
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v27, v11 :: v_dual_cndmask_b32 v11, v28, v12
-; GFX11-NEXT:    v_dual_cndmask_b32 v12, v29, v13 :: v_dual_cndmask_b32 v13, v30, v14
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v24, v8, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, v25, v9, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v26, v10, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, v27, v11, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, v28, v12, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, v29, v13, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, v30, v14, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_cndmask_b32 v14, v31, v15 :: v_dual_cndmask_b32 v15, v32, v16
+; GFX11-NEXT:    v_cndmask_b32_e64 v15, v31, v15, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b
   ret <32 x bfloat> %op
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
index 8766303d7ee6e..283cbd6aa61c6 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
@@ -45,13 +45,13 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:               ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_i8_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_i8_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i8_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i8_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -107,12 +107,12 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_i16_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_i16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i16_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -168,12 +168,12 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_i32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -228,13 +228,13 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_i64_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_i64_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i64_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -289,13 +289,13 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v2i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v2i32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2i32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -350,14 +350,14 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s17, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s2, s16
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s18, s16
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[18:19]
-; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v3i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v3i32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    s_getpc_b64 s[20:21]
+; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v3i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v3i32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -412,15 +412,15 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s3, s17
-; GFX9-NEXT:    s_mov_b32 s2, s16
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s19, s17
+; GFX9-NEXT:    s_mov_b32 s18, s16
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[18:19]
-; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v4i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v4i32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    s_getpc_b64 s[20:21]
+; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v4i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v4i32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -475,19 +475,19 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s22, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s3, s17
-; GFX9-NEXT:    s_mov_b32 s2, s16
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
-; GFX9-NEXT:    s_mov_b32 s16, s18
-; GFX9-NEXT:    s_mov_b32 s17, s19
-; GFX9-NEXT:    s_mov_b32 s18, s20
-; GFX9-NEXT:    s_mov_b32 s19, s21
+; GFX9-NEXT:    s_mov_b32 s23, s21
+; GFX9-NEXT:    s_mov_b32 s22, s20
+; GFX9-NEXT:    s_mov_b32 s21, s19
+; GFX9-NEXT:    s_mov_b32 s20, s18
+; GFX9-NEXT:    s_mov_b32 s19, s17
+; GFX9-NEXT:    s_mov_b32 s18, s16
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[22:23]
-; GFX9-NEXT:    s_add_u32 s22, s22, external_void_func_v8i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s23, s23, external_void_func_v8i32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[22:23]
+; GFX9-NEXT:    s_getpc_b64 s[24:25]
+; GFX9-NEXT:    s_add_u32 s24, s24, external_void_func_v8i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s25, s25, external_void_func_v8i32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[24:25]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -553,12 +553,12 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_f16_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_f16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_f16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f16_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -614,12 +614,12 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_bf16_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_bf16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_bf16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_bf16_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -675,12 +675,12 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_f32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_f32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_f32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -735,13 +735,13 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_f64_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_f64_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_f64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f64_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -797,12 +797,12 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v2f16_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v2f16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2f16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2f16_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -859,12 +859,12 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v2bf16_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v2bf16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2bf16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2bf16_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -919,13 +919,13 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v3f16_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v3f16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v3f16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v3f16_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -980,13 +980,13 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v4f16_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v4f16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v4f16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v4f16_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -1041,13 +1041,13 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_p0_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_p0_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_p0_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p0_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -1102,13 +1102,13 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_p1_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_p1_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_p1_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p1_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -1164,12 +1164,12 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_p3_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_p3_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_p3_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p3_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -1224,15 +1224,15 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
 ; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s3, s17
-; GFX9-NEXT:    s_mov_b32 s2, s16
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s19, s17
+; GFX9-NEXT:    s_mov_b32 s18, s16
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[18:19]
-; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2p1_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2p1_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    s_getpc_b64 s[20:21]
+; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v2p1_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v2p1_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -1287,13 +1287,13 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v2p5_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v2p5_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2p5_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2p5_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -1348,16 +1348,16 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
 ; GFX9-NEXT:    v_writelane_b32 v40, s19, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s3, s17
-; GFX9-NEXT:    s_mov_b32 s2, s16
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
-; GFX9-NEXT:    s_mov_b32 s16, s18
+; GFX9-NEXT:    s_mov_b32 s20, s18
+; GFX9-NEXT:    s_mov_b32 s19, s17
+; GFX9-NEXT:    s_mov_b32 s18, s16
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[20:21]
-; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_i64_inreg_i32_inreg_i64_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_i64_inreg_i32_inreg_i64_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
+; GFX9-NEXT:    s_getpc_b64 s[22:23]
+; GFX9-NEXT:    s_add_u32 s22, s22, external_void_func_i64_inreg_i32_inreg_i64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[22:23]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -1412,23 +1412,24 @@ define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, vcc
 ; GFX9-NEXT:    v_writelane_b32 v40, s29, 2
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s3, s17
-; GFX9-NEXT:    s_mov_b32 s2, s16
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
-; GFX9-NEXT:    s_mov_b32 s16, s18
-; GFX9-NEXT:    s_mov_b32 s17, s19
-; GFX9-NEXT:    s_mov_b32 s18, s20
-; GFX9-NEXT:    s_mov_b32 s19, s21
-; GFX9-NEXT:    s_mov_b32 s20, s22
-; GFX9-NEXT:    s_mov_b32 s21, s23
-; GFX9-NEXT:    s_mov_b32 s22, s24
-; GFX9-NEXT:    s_mov_b32 s23, s25
-; GFX9-NEXT:    s_mov_b32 s24, s26
-; GFX9-NEXT:    s_mov_b32 s25, s27
-; GFX9-NEXT:    s_mov_b32 s26, s28
+; GFX9-NEXT:    s_mov_b32 s30, s28
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s29, s27
+; GFX9-NEXT:    s_mov_b32 s28, s26
+; GFX9-NEXT:    s_mov_b32 s27, s25
+; GFX9-NEXT:    s_mov_b32 s26, s24
+; GFX9-NEXT:    s_mov_b32 s25, s23
+; GFX9-NEXT:    s_mov_b32 s24, s22
+; GFX9-NEXT:    s_mov_b32 s23, s21
+; GFX9-NEXT:    s_mov_b32 s22, s20
+; GFX9-NEXT:    s_mov_b32 s21, s19
+; GFX9-NEXT:    s_mov_b32 s20, s18
+; GFX9-NEXT:    s_mov_b32 s19, s17
+; GFX9-NEXT:    s_mov_b32 s18, s16
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s30
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 vcc
 ; GFX9-NEXT:    s_add_u32 vcc_lo, vcc_lo, external_void_func_a15i32_inreg at rel32@lo+4
@@ -1513,22 +1514,6 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inre
 ; GFX9-NEXT:    v_writelane_b32 v40, s23, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s3, s7
-; GFX9-NEXT:    s_mov_b32 s2, s6
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s4, s8
-; GFX9-NEXT:    s_mov_b32 s5, s9
-; GFX9-NEXT:    s_mov_b32 s6, s10
-; GFX9-NEXT:    s_mov_b32 s7, s11
-; GFX9-NEXT:    s_mov_b32 s8, s15
-; GFX9-NEXT:    s_mov_b32 s9, s16
-; GFX9-NEXT:    s_mov_b32 s10, s17
-; GFX9-NEXT:    s_mov_b32 s11, s18
-; GFX9-NEXT:    s_mov_b32 s15, s19
-; GFX9-NEXT:    s_mov_b32 s16, s20
-; GFX9-NEXT:    s_mov_b32 s17, s21
-; GFX9-NEXT:    s_mov_b32 s18, s22
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[24:25]
 ; GFX9-NEXT:    s_add_u32 s24, s24, external_void_func_a15i32_inreg_i32_inreg__noimplicit at rel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 725c2d71ac5e3..5882ae7a31c09 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -71,12 +71,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT:    v_mov_b32_e32 v0, 1
+; VI-NEXT:    s_mov_b64 s[4:5], -1
 ; VI-NEXT:    s_mov_b32 s32, 0
-; VI-NEXT:    s_getpc_b64 s[4:5]
-; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i1 at rel32@lo+4
-; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1 at rel32@hi+12
-; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_getpc_b64 s[6:7]
+; VI-NEXT:    s_add_u32 s6, s6, external_void_func_i1 at rel32@lo+4
+; VI-NEXT:    s_addc_u32 s7, s7, external_void_func_i1 at rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: test_call_external_void_func_i1_imm:
@@ -89,12 +89,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT:    v_mov_b32_e32 v0, 1
+; CI-NEXT:    s_mov_b64 s[4:5], -1
 ; CI-NEXT:    s_mov_b32 s32, 0
-; CI-NEXT:    s_getpc_b64 s[4:5]
-; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i1 at rel32@lo+4
-; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1 at rel32@hi+12
-; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_getpc_b64 s[6:7]
+; CI-NEXT:    s_add_u32 s6, s6, external_void_func_i1 at rel32@lo+4
+; CI-NEXT:    s_addc_u32 s7, s7, external_void_func_i1 at rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_call_external_void_func_i1_imm:
@@ -107,23 +107,23 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i1 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i1 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_call_external_void_func_i1_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_mov_b32_e32 v0, 1
+; GFX11-NEXT:    s_mov_b32 s0, -1
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i1 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i1 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_i1_imm:
@@ -131,14 +131,14 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; HSA-NEXT:    s_add_u32 s0, s0, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    v_mov_b32_e32 v0, 1
+; HSA-NEXT:    s_mov_b64 s[4:5], -1
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
-; HSA-NEXT:    s_getpc_b64 s[4:5]
-; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i1 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i1 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; HSA-NEXT:    s_getpc_b64 s[6:7]
+; HSA-NEXT:    s_add_u32 s6, s6, external_void_func_i1 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s7, s7, external_void_func_i1 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_i1(i1 true)
   ret void
@@ -160,11 +160,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
-; VI-NEXT:    s_getpc_b64 s[4:5]
-; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext at rel32@lo+4
-; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext at rel32@hi+12
-; VI-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_getpc_b64 s[6:7]
+; VI-NEXT:    s_add_u32 s6, s6, external_void_func_i1_signext at rel32@lo+4
+; VI-NEXT:    s_addc_u32 s7, s7, external_void_func_i1_signext at rel32@hi+12
+; VI-NEXT:    v_and_b32_e32 v0, 1, v0
+; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; VI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: test_call_external_void_func_i1_signext:
@@ -182,11 +183,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
-; CI-NEXT:    s_getpc_b64 s[4:5]
-; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext at rel32@lo+4
-; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext at rel32@hi+12
-; CI-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_getpc_b64 s[6:7]
+; CI-NEXT:    s_add_u32 s6, s6, external_void_func_i1_signext at rel32@lo+4
+; CI-NEXT:    s_addc_u32 s7, s7, external_void_func_i1_signext at rel32@hi+12
+; CI-NEXT:    v_and_b32_e32 v0, 1, v0
+; CI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; CI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_call_external_void_func_i1_signext:
@@ -204,11 +206,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext at rel32@hi+12
-; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i1_signext at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i1_signext at rel32@hi+12
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_call_external_void_func_i1_signext:
@@ -218,11 +221,13 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i1_signext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i1_signext at rel32@hi+12
-; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_signext at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_signext at rel32@hi+12
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_i1_signext:
@@ -237,12 +242,14 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[4:5]
-; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext at rel32@hi+12
-; HSA-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; HSA-NEXT:    s_getpc_b64 s[6:7]
+; HSA-NEXT:    s_add_u32 s6, s6, external_void_func_i1_signext at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s7, s7, external_void_func_i1_signext at rel32@hi+12
+; HSA-NEXT:    v_and_b32_e32 v0, 1, v0
+; HSA-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; HSA-NEXT:    s_endpgm
+
   %var = load volatile i1, ptr addrspace(1) undef
   call void @external_void_func_i1_signext(i1 signext %var)
   ret void
@@ -265,11 +272,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
-; VI-NEXT:    s_getpc_b64 s[4:5]
-; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext at rel32@lo+4
-; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext at rel32@hi+12
+; VI-NEXT:    s_getpc_b64 s[6:7]
+; VI-NEXT:    s_add_u32 s6, s6, external_void_func_i1_zeroext at rel32@lo+4
+; VI-NEXT:    s_addc_u32 s7, s7, external_void_func_i1_zeroext at rel32@hi+12
 ; VI-NEXT:    v_and_b32_e32 v0, 1, v0
-; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; VI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: test_call_external_void_func_i1_zeroext:
@@ -287,11 +295,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
-; CI-NEXT:    s_getpc_b64 s[4:5]
-; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext at rel32@lo+4
-; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext at rel32@hi+12
+; CI-NEXT:    s_getpc_b64 s[6:7]
+; CI-NEXT:    s_add_u32 s6, s6, external_void_func_i1_zeroext at rel32@lo+4
+; CI-NEXT:    s_addc_u32 s7, s7, external_void_func_i1_zeroext at rel32@hi+12
 ; CI-NEXT:    v_and_b32_e32 v0, 1, v0
-; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; CI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_call_external_void_func_i1_zeroext:
@@ -309,11 +318,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext at rel32@hi+12
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i1_zeroext at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i1_zeroext at rel32@hi+12
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_call_external_void_func_i1_zeroext:
@@ -323,11 +333,13 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i1_zeroext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i1_zeroext at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_zeroext at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_zeroext at rel32@hi+12
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_i1_zeroext:
@@ -342,11 +354,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[4:5]
-; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext at rel32@hi+12
+; HSA-NEXT:    s_getpc_b64 s[6:7]
+; HSA-NEXT:    s_add_u32 s6, s6, external_void_func_i1_zeroext at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s7, s7, external_void_func_i1_zeroext at rel32@hi+12
 ; HSA-NEXT:    v_and_b32_e32 v0, 1, v0
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; HSA-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; HSA-NEXT:    s_endpgm
   %var = load volatile i1, ptr addrspace(1) undef
   call void @external_void_func_i1_zeroext(i1 zeroext %var)
diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
index 10d71a315fbf9..66a04ed26ddb7 100644
--- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
@@ -11,8 +11,7 @@ define i1 @test1(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, 1000
   %cmp2 = icmp slt i32 %arg2, 1000
@@ -25,8 +24,7 @@ define i1 @test2(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, 1000
   %cmp2 = icmp ult i32 %arg2, 1000
@@ -39,8 +37,7 @@ define i1 @test3(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0x3e9, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, 0x3e9, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sle i32 %arg1, 1000
   %cmp2 = icmp sle i32 %arg2, 1000
@@ -53,8 +50,7 @@ define i1 @test4(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x3e9, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u32_e64 s0, 0x3e9, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ule i32 %arg1, 1000
   %cmp2 = icmp ule i32 %arg2, 1000
@@ -67,8 +63,7 @@ define i1 @test5(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, 1000
   %cmp2 = icmp sgt i32 %arg2, 1000
@@ -81,8 +76,7 @@ define i1 @test6(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg1, 1000
   %cmp2 = icmp ugt i32 %arg2, 1000
@@ -95,8 +89,7 @@ define i1 @test7(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0x3e7, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, 0x3e7, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sge i32 %arg1, 1000
   %cmp2 = icmp sge i32 %arg2, 1000
@@ -109,8 +102,7 @@ define i1 @test8(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3e7, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, 0x3e7, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp uge i32 %arg1, 1000
   %cmp2 = icmp uge i32 %arg2, 1000
@@ -123,8 +115,7 @@ define i1 @test9(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, %arg3
   %cmp2 = icmp slt i32 %arg2, %arg3
@@ -137,8 +128,7 @@ define i1 @test10(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %arg3
   %cmp2 = icmp ult i32 %arg2, %arg3
@@ -151,8 +141,7 @@ define i1 @test11(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_le_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sle i32 %arg1, %arg3
   %cmp2 = icmp sle i32 %arg2, %arg3
@@ -165,8 +154,7 @@ define i1 @test12(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ule i32 %arg1, %arg3
   %cmp2 = icmp ule i32 %arg2, %arg3
@@ -179,8 +167,7 @@ define i1 @test13(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, %arg3
   %cmp2 = icmp sgt i32 %arg2, %arg3
@@ -193,8 +180,7 @@ define i1 @test14(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg1, %arg3
   %cmp2 = icmp ugt i32 %arg2, %arg3
@@ -207,8 +193,7 @@ define i1 @test15(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_ge_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_ge_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sge i32 %arg1, %arg3
   %cmp2 = icmp sge i32 %arg2, %arg3
@@ -221,8 +206,7 @@ define i1 @test16(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_ge_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp uge i32 %arg1, %arg3
   %cmp2 = icmp uge i32 %arg2, %arg3
@@ -235,8 +219,7 @@ define i1 @test17(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, 1000
   %cmp2 = icmp slt i32 %arg2, 1000
@@ -249,8 +232,7 @@ define i1 @test18(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, 1000
   %cmp2 = icmp ult i32 %arg2, 1000
@@ -263,8 +245,7 @@ define i1 @test19(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0x3e9, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, 0x3e9, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sle i32 %arg1, 1000
   %cmp2 = icmp sle i32 %arg2, 1000
@@ -277,8 +258,7 @@ define i1 @test20(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x3e9, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u32_e64 s0, 0x3e9, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ule i32 %arg1, 1000
   %cmp2 = icmp ule i32 %arg2, 1000
@@ -291,8 +271,7 @@ define i1 @test21(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, 1000
   %cmp2 = icmp sgt i32 %arg2, 1000
@@ -305,8 +284,7 @@ define i1 @test22(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg1, 1000
   %cmp2 = icmp ugt i32 %arg2, 1000
@@ -319,8 +297,7 @@ define i1 @test23(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0x3e7, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, 0x3e7, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sge i32 %arg1, 1000
   %cmp2 = icmp sge i32 %arg2, 1000
@@ -333,8 +310,7 @@ define i1 @test24(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3e7, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, 0x3e7, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp uge i32 %arg1, 1000
   %cmp2 = icmp uge i32 %arg2, 1000
@@ -347,8 +323,7 @@ define i1 @test25(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, %arg3
   %cmp2 = icmp slt i32 %arg2, %arg3
@@ -361,8 +336,7 @@ define i1 @test26(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %arg3
   %cmp2 = icmp ult i32 %arg2, %arg3
@@ -375,8 +349,7 @@ define i1 @test27(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_le_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sle i32 %arg1, %arg3
   %cmp2 = icmp sle i32 %arg2, %arg3
@@ -389,8 +362,7 @@ define i1 @test28(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ule i32 %arg1, %arg3
   %cmp2 = icmp ule i32 %arg2, %arg3
@@ -403,8 +375,7 @@ define i1 @test29(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, %arg3
   %cmp2 = icmp sgt i32 %arg2, %arg3
@@ -417,8 +388,7 @@ define i1 @test30(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg1, %arg3
   %cmp2 = icmp ugt i32 %arg2, %arg3
@@ -431,8 +401,7 @@ define i1 @test31(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_ge_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_ge_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sge i32 %arg1, %arg3
   %cmp2 = icmp sge i32 %arg2, %arg3
@@ -445,8 +414,7 @@ define i1 @test32(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_ge_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp uge i32 %arg1, %arg3
   %cmp2 = icmp uge i32 %arg2, %arg3
@@ -459,8 +427,7 @@ define i1 @test33(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v1, 0x3e8, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, v1, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, %arg2
   %cmp2 = icmp slt i32 %arg1, 1000
@@ -633,8 +600,7 @@ define i1 @test42(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg3, %arg1
   %cmp2 = icmp ult i32 %arg3, %arg2
@@ -647,8 +613,7 @@ define i1 @test43(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg3, %arg1
   %cmp2 = icmp ult i32 %arg3, %arg2
@@ -661,8 +626,7 @@ define i1 @test44(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg3, %arg1
   %cmp2 = icmp ugt i32 %arg3, %arg2
@@ -675,8 +639,7 @@ define i1 @test45(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg3, %arg1
   %cmp2 = icmp ugt i32 %arg3, %arg2
@@ -689,8 +652,7 @@ define i1 @test46(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg3, %arg1
   %cmp2 = icmp sgt i32 %arg2, %arg3
@@ -703,8 +665,7 @@ define i1 @test47(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, %arg3
   %cmp2 = icmp slt i32 %arg3, %arg2
@@ -717,8 +678,7 @@ define i1 @test48(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, %arg3
   %cmp2 = icmp sgt i32 %arg3, %arg2
@@ -731,8 +691,7 @@ define i1 @test49(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg3, %arg1
   %cmp2 = icmp slt i32 %arg2, %arg3
@@ -745,8 +704,7 @@ define i1 @test50(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg3, %arg1
   %cmp2 = icmp sgt i32 %arg2, %arg3
@@ -759,8 +717,7 @@ define i1 @test51(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, %arg3
   %cmp2 = icmp slt i32 %arg3, %arg2
@@ -773,8 +730,7 @@ define i1 @test52(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, %arg3
   %cmp2 = icmp sgt i32 %arg3, %arg2
@@ -787,8 +743,7 @@ define i1 @test53(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg3, %arg1
   %cmp2 = icmp slt i32 %arg2, %arg3
@@ -801,8 +756,7 @@ define i1 @test54(float %arg1, float %arg2, float %arg3) #0 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %arg3
   %cmp2 = fcmp olt float %arg2, %arg3
@@ -815,8 +769,7 @@ define i1 @test55(double %arg1, double %arg2, double %arg3) #0 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ole double %arg1, %arg3
   %cmp2 = fcmp ole double %arg2, %arg3
@@ -829,8 +782,7 @@ define i1 @test56(double %arg1, double %arg2, double %arg3) #0 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ogt double %arg1, %arg3
   %cmp2 = fcmp ogt double %arg2, %arg3
@@ -843,8 +795,7 @@ define i1 @test57(float %arg1, float %arg2, float %arg3) #0 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp oge float %arg1, %arg3
   %cmp2 = fcmp oge float %arg2, %arg3
@@ -857,16 +808,14 @@ define i1 @test58(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT:    v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nle_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test58:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ugt double %arg1, %arg3
   %cmp2 = fcmp ugt double %arg2, %arg3
@@ -879,16 +828,14 @@ define i1 @test59(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nlt_f32_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test59:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp uge float %arg1, %arg3
   %cmp2 = fcmp uge float %arg2, %arg3
@@ -901,16 +848,14 @@ define i1 @test60(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ngt_f32_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test60:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ule float %arg1, %arg3
   %cmp2 = fcmp ule float %arg2, %arg3
@@ -923,16 +868,14 @@ define i1 @test61(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT:    v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nge_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test61:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult double %arg1, %arg3
   %cmp2 = fcmp ult double %arg2, %arg3
@@ -946,8 +889,7 @@ define i1 @test62(float %arg1, float %arg2, float %arg3) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, 1.0
   %add2 = fadd nnan float %arg2, 2.0
@@ -964,8 +906,7 @@ define i1 @test63(double %arg1, double %arg2, double %arg3) #0 {
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 2.0
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan double %arg1, 1.0
   %add2 = fadd nnan double %arg2, 2.0
@@ -982,8 +923,7 @@ define i1 @test64(double %arg1, double %arg2, double %arg3) #0 {
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 2.0
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan double %arg1, 1.0
   %add2 = fadd nnan double %arg2, 2.0
@@ -999,8 +939,7 @@ define i1 @test65(float %arg1, float %arg2, float %arg3) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, 1.0
   %add2 = fadd nnan float %arg2, 2.0
@@ -1017,8 +956,7 @@ define i1 @test66(double %arg1, double %arg2, double %arg3) {
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 2.0
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan double %arg1, 1.0
   %add2 = fadd nnan double %arg2, 2.0
@@ -1034,8 +972,7 @@ define i1 @test67(float %arg1, float %arg2, float %arg3) #0 {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, 1.0
   %add2 = fadd nnan float %arg2, 2.0
@@ -1051,8 +988,7 @@ define i1 @test68(float %arg1, float %arg2, float %arg3) #0 {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_f32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, 1.0
   %add2 = fadd nnan float %arg2, 2.0
@@ -1069,8 +1005,7 @@ define i1 @test69(double %arg1, double %arg2, double %arg3) {
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 2.0
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan double %arg1, 1.0
   %add2 = fadd nnan double %arg2, 2.0
@@ -1086,16 +1021,14 @@ define i1 @test70(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test70:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1112,8 +1045,7 @@ define i1 @test71(double %arg1, double %arg2, double %arg3) {
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GCN-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1130,8 +1062,7 @@ define i1 @test72(double %arg1, double %arg2, double %arg3) {
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GCN-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1147,16 +1078,14 @@ define i1 @test73(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test73:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1173,8 +1102,7 @@ define i1 @test74(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT:    v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nle_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test74:
@@ -1183,8 +1111,7 @@ define i1 @test74(double %arg1, double %arg2, double %arg3) {
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1200,16 +1127,14 @@ define i1 @test75(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nlt_f32_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test75:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1225,16 +1150,14 @@ define i1 @test76(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ngt_f32_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test76:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1251,8 +1174,7 @@ define i1 @test77(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT:    v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nge_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test77:
@@ -1261,8 +1183,7 @@ define i1 @test77(double %arg1, double %arg2, double %arg3) {
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1277,8 +1198,7 @@ define i1 @test78(float %arg1, float %arg2, float %arg3) #0 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %arg3
   %cmp2 = fcmp ogt float %arg3, %arg2
@@ -1291,16 +1211,14 @@ define i1 @test79(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nge_f32_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test79:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult float %arg1, %arg3
   %cmp2 = fcmp ugt float %arg3, %arg2
@@ -1314,8 +1232,7 @@ define i1 @test80(float %arg1, float %arg2, float %arg3) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, 1.0
   %add2 = fadd nnan float %arg2, 2.0
@@ -1332,8 +1249,7 @@ define i1 @test81(double %arg1, double %arg2, double %arg3) {
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 2.0
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan double %arg1, 1.0
   %add2 = fadd nnan double %arg2, 2.0
@@ -1350,8 +1266,7 @@ define i1 @test82(double %arg1, double %arg2, double %arg3) {
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GCN-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1367,16 +1282,14 @@ define i1 @test83(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ngt_f32_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test83:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1393,16 +1306,14 @@ define i1 @test84(half %arg1, half %arg2, half %arg3) {
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f16_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test84:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f16_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call half @llvm.canonicalize.f16(half %arg1)
   %var2 = call half @llvm.canonicalize.f16(half %arg2)
@@ -1487,16 +1398,14 @@ define i1 @test87(half %arg1, half %arg2, half %arg3) {
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test87:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f16_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call half @llvm.canonicalize.f16(half %arg1)
   %var2 = call half @llvm.canonicalize.f16(half %arg2)
@@ -1547,16 +1456,14 @@ define i1 @test89(half %arg1, half %arg2, half %arg3) {
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nlt_f16_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test89:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f16_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call half @llvm.canonicalize.f16(half %arg1)
   %var2 = call half @llvm.canonicalize.f16(half %arg2)
@@ -1573,16 +1480,14 @@ define i1 @test90(half %arg1, half %arg2, half %arg3) {
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ngt_f16_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test90:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f16_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call half @llvm.canonicalize.f16(half %arg1)
   %var2 = call half @llvm.canonicalize.f16(half %arg2)
@@ -1631,8 +1536,7 @@ define i1 @test92(i32 %arg1, i32 %arg2, i32 %arg3, i32 %C) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min3_u32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1651,7 +1555,6 @@ define i1 @test93(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
 ; GCN-NEXT:    v_cmp_gt_u32_e64 s0, v1, v4
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1671,8 +1574,7 @@ define i1 @test94(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %ar
 ; GCN-NEXT:    v_min3_u32 v0, v0, v1, v2
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v4
 ; GCN-NEXT:    v_min3_u32 v0, v5, v6, v0
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v8
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1697,8 +1599,7 @@ define i1 @test95(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_maxmin_u32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1713,8 +1614,7 @@ define i1 @test96(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_minmax_u32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1730,8 +1630,7 @@ define i1 @test97(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_max3_u32 v0, v0, v2, v3
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1749,8 +1648,7 @@ define i1 @test98(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v2, v2, v3
 ; GCN-NEXT:    v_minmax_u32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1768,8 +1666,7 @@ define i1 @test99(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v2, v2, v3
 ; GCN-NEXT:    v_min3_u32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1787,8 +1684,7 @@ define i1 @test100(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v2, v2, v3
 ; GCN-NEXT:    v_maxmin_u32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1807,8 +1703,7 @@ define i1 @test101(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_minmax_u32 v1, v3, v4, v5
 ; GCN-NEXT:    v_min3_u32 v0, v0, v2, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v6
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1831,8 +1726,7 @@ define i1 @test102(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_min_u32_e32 v1, v2, v3
 ; GCN-NEXT:    v_min3_u32 v0, v0, v5, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v6
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1859,7 +1753,6 @@ define i1 @test103(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v2, v6
 ; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v6
 ; GCN-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1892,7 +1785,6 @@ define i1 @test104(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    s_or_b32 s1, s2, vcc_lo
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1931,7 +1823,6 @@ define i1 @test105(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 ; GCN-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GCN-NEXT:    s_or_b32 s1, s2, s1
 ; GCN-NEXT:    s_and_b32 s0, s0, s1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1968,7 +1859,6 @@ define i1 @test106(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    s_or_b32 s0, s2, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C1
   %cmp2 = icmp ult i32 %arg2, %C1
@@ -2001,8 +1891,7 @@ define i1 @test107(float %arg1, float %arg2, float %arg3, float %C) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f32_e64 s0, v0, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2017,16 +1906,14 @@ define i1 @test108(float %arg1, float %arg2, float %arg3, float %C) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max3_f32 v0, v0, v1, v2
-; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v3
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nge_f32_e64 s0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test108:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max3_f32 v0, v0, v1, v2
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v3
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult float %arg1, %C
   %cmp2 = fcmp ult float %arg2, %C
@@ -2046,7 +1933,6 @@ define i1 @test109(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, v1, v4
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test109:
@@ -2056,7 +1942,6 @@ define i1 @test109(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
 ; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
 ; GFX11NONANS-NEXT:    v_cmp_gt_f32_e64 s0, v1, v4
 ; GFX11NONANS-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2078,7 +1963,6 @@ define i1 @test110(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
 ; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
 ; GCN-NEXT:    v_cmp_gt_f32_e64 s0, v1, v8
 ; GCN-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, %C1
   %add2 = fadd nnan float %arg2, %C2
@@ -2099,12 +1983,12 @@ define i1 @test111(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
-; GFX11-NEXT:    v_dual_min_f32 v2, v2, v3 :: v_dual_max_f32 v3, v4, v4
+; GFX11-NEXT:    v_min_f32_e32 v2, v2, v3
 ; GFX11-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GFX11-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX11-NEXT:    v_max_f32_e32 v1, v4, v4
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min3_f32 v0, v5, v6, v0
-; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, v0, v8
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test111:
@@ -2114,8 +1998,7 @@ define i1 @test111(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11NONANS-NEXT:    v_min3_f32 v0, v0, v1, v2
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v4
 ; GFX11NONANS-NEXT:    v_min3_f32 v0, v5, v6, v0
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v8
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2141,13 +2024,13 @@ define i1 @test112(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
 ; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v4, v8
-; GFX11-NEXT:    v_dual_max_f32 v5, v5, v5 :: v_dual_min_f32 v2, v2, v3
+; GFX11-NEXT:    v_min_f32_e32 v2, v2, v3
 ; GFX11-NEXT:    v_max_f32_e32 v3, v6, v6
 ; GFX11-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GFX11-NEXT:    v_min3_f32 v0, v0, v5, v3
+; GFX11-NEXT:    v_max_f32_e32 v1, v5, v5
+; GFX11-NEXT:    v_min3_f32 v0, v0, v1, v3
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, v0, v8
 ; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test112:
@@ -2157,8 +2040,7 @@ define i1 @test112(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11NONANS-NEXT:    v_min3_f32 v0, v0, v1, v2
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v4
 ; GFX11NONANS-NEXT:    v_min3_f32 v0, v5, v6, v0
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v8
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2187,15 +2069,13 @@ define i1 @test113(float %arg1, float %arg2, float %arg3, float %C) {
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_nge_f32_e64 s0, v0, v3
 ; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test113:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_maxmin_f32 v0, v0, v1, v2
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v3
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult float %arg1, %C
   %cmp2 = fcmp ult float %arg2, %C
@@ -2214,7 +2094,6 @@ define i1 @test114(float %arg1, float %arg2, float %arg3, float %C) {
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, v0, v3
 ; GFX11-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test114:
@@ -2224,7 +2103,6 @@ define i1 @test114(float %arg1, float %arg2, float %arg3, float %C) {
 ; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX11NONANS-NEXT:    v_cmp_gt_f32_e64 s0, v0, v3
 ; GFX11NONANS-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ogt float %arg1, %C
   %cmp2 = fcmp ogt float %arg2, %C
@@ -2244,7 +2122,6 @@ define i1 @test115(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
 ; GFX11-NEXT:    v_cmp_nge_f32_e64 s0, v1, v4
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test115:
@@ -2252,8 +2129,7 @@ define i1 @test115(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v2, v2, v3
 ; GFX11NONANS-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v4
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2287,7 +2163,6 @@ define i1 @test116(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-NEXT:    s_or_b32 s1, s2, vcc_lo
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test116:
@@ -2304,7 +2179,6 @@ define i1 @test116(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11NONANS-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11NONANS-NEXT:    s_or_b32 s1, s2, vcc_lo
 ; GFX11NONANS-NEXT:    s_or_b32 s0, s0, s1
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2348,7 +2222,6 @@ define i1 @test117(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-NEXT:    s_or_b32 s0, s2, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test117:
@@ -2366,7 +2239,6 @@ define i1 @test117(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11NONANS-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11NONANS-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11NONANS-NEXT:    s_or_b32 s0, s2, s0
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C1
   %cmp2 = fcmp olt float %arg2, %C1
@@ -2403,8 +2275,7 @@ define i1 @test118(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
 ; GCN-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_max3_f32 v0, v0, v2, v3
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f32_e64 s0, v0, v8
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, %C1
   %add2 = fadd nnan float %arg2, %C2
@@ -2428,8 +2299,7 @@ define i1 @test119(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
 ; GCN-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
 ; GCN-NEXT:    v_min_f32_e32 v2, v2, v3
 ; GCN-NEXT:    v_minmax_f32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f32_e64 s0, v0, v8
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, %C1
   %add2 = fadd nnan float %arg2, %C2
@@ -2453,8 +2323,7 @@ define i1 @test120(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
 ; GCN-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
 ; GCN-NEXT:    v_max_f32_e32 v2, v2, v3
 ; GCN-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f32_e64 s0, v0, v8
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, %C1
   %add2 = fadd nnan float %arg2, %C2
@@ -2478,8 +2347,7 @@ define i1 @test121(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
 ; GCN-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
 ; GCN-NEXT:    v_max_f32_e32 v2, v2, v3
 ; GCN-NEXT:    v_maxmin_f32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f32_e64 s0, v0, v8
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, %C1
   %add2 = fadd nnan float %arg2, %C2
@@ -2500,8 +2368,7 @@ define i1 @test122(double %arg1, double %arg2, double %arg3) #1 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult double %arg1, %arg3
   %cmp2 = fcmp ult double %arg2, %arg3
@@ -2516,8 +2383,7 @@ define i1 @test123(double %arg1, double %arg2, double %arg3) #1 {
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GCN-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -2536,7 +2402,6 @@ define i1 @test124(i32 %arg1, i64 %arg2) {
 ; GCN-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0x3e8, v[1:2]
 ; GCN-NEXT:    v_cmp_gt_i32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, 1000
   %cmp2 = icmp slt i64 %arg2, 1000
@@ -2551,7 +2416,6 @@ define i1 @test125(i32 %arg1, i32 %arg2) {
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x3e8, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s0, 0x3e8, v1
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp eq i32 %arg1, 1000
   %cmp2 = icmp eq i32 %arg2, 1000
@@ -2566,7 +2430,6 @@ define i1 @test126(i32 %arg1, i32 %arg2) {
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0x3e8, v0
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s0, 0x3e8, v1
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ne i32 %arg1, 1000
   %cmp2 = icmp ne i32 %arg2, 1000
@@ -2581,7 +2444,6 @@ define i1 @test127(i64 %arg1, i64 %arg2, i64 %arg3) {
 ; GCN-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
 ; GCN-NEXT:    v_cmp_lt_u64_e64 s0, v[2:3], v[4:5]
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
    %cmp1 = icmp ult i64 %arg1, %arg3
    %cmp2 = icmp ult i64 %arg2, %arg3
@@ -2596,7 +2458,6 @@ define i1 @test128(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v2, v1
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %arg3
   %cmp2 = icmp ult i32 %arg3, %arg2
@@ -2611,7 +2472,6 @@ define i1 @test129(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cmp_le_u32_e64 s0, v1, v2
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %arg3
   %cmp2 = icmp ule i32 %arg2, %arg3
@@ -2626,7 +2486,6 @@ define i1 @test130(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc_lo, v2, v0
 ; GCN-NEXT:    v_cmp_gt_u32_e64 s0, v1, v2
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ule i32 %arg3, %arg1
   %cmp2 = icmp ugt i32 %arg2, %arg3
@@ -2641,7 +2500,6 @@ define i1 @test131(i16 %arg1, i32 %arg2) {
 ; GCN-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 10, v0
 ; GCN-NEXT:    v_cmp_gt_u32_e64 s0, 10, v1
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i16 %arg1, 10
   %cmp2 = icmp ult i32 %arg2, 10
@@ -2659,7 +2517,6 @@ define i1 @test132(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4) {
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GCN-NEXT:    s_or_b32 s1, s1, vcc_lo
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %arg3
   %cmp2 = icmp ult i32 %arg2, %arg3
@@ -2677,7 +2534,6 @@ define i1 @test133(i32 %arg1, i32 %arg2) {
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v0
 ; GCN-NEXT:    v_cmp_gt_u32_e64 s0, 0x3e8, v1
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, 100
   %cmp2 = icmp ult i32 %arg2, 1000
@@ -2692,15 +2548,13 @@ define i1 @test134(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, v2, v1
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test134:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %arg3
   %cmp2 = fcmp ogt float %arg3, %arg2
@@ -2715,15 +2569,13 @@ define i1 @test135(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_nle_f32_e64 s0, v2, v1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test135:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult float %arg1, %arg3
   %cmp2 = fcmp ugt float %arg3, %arg2
@@ -2740,7 +2592,6 @@ define i1 @test136(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_ge_f64_e64 s0, v[4:5], v[2:3]
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test136:
@@ -2749,8 +2600,7 @@ define i1 @test136(double %arg1, double %arg2, double %arg3) {
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -2768,15 +2618,13 @@ define i1 @test137(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_nlt_f32_e64 s0, v2, v1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test137:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -2793,15 +2641,13 @@ define i1 @test138(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test138:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %arg3
   %cmp2 = fcmp olt float %arg2, %arg3
@@ -2816,15 +2662,13 @@ define i1 @test139(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_le_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test139:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ole double %arg1, %arg3
   %cmp2 = fcmp ole double %arg2, %arg3
@@ -2839,15 +2683,13 @@ define i1 @test140(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_gt_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test140:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ogt double %arg1, %arg3
   %cmp2 = fcmp ogt double %arg2, %arg3
@@ -2862,15 +2704,13 @@ define i1 @test141(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test141:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp oge float %arg1, %arg3
   %cmp2 = fcmp oge float %arg2, %arg3
@@ -2885,15 +2725,13 @@ define i1 @test142(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_nle_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test142:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ugt double %arg1, %arg3
   %cmp2 = fcmp ugt double %arg2, %arg3
@@ -2908,15 +2746,13 @@ define i1 @test143(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_nlt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test143:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp uge float %arg1, %arg3
   %cmp2 = fcmp uge float %arg2, %arg3
@@ -2931,15 +2767,13 @@ define i1 @test144(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_ngt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test144:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ule float %arg1, %arg3
   %cmp2 = fcmp ule float %arg2, %arg3
@@ -2954,15 +2788,13 @@ define i1 @test145(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_nge_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test145:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult double %arg1, %arg3
   %cmp2 = fcmp ult double %arg2, %arg3
@@ -2978,15 +2810,13 @@ define i1 @test146(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test146:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -3005,7 +2835,6 @@ define i1 @test147(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_le_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test147:
@@ -3014,8 +2843,7 @@ define i1 @test147(double %arg1, double %arg2, double %arg3) {
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -3034,7 +2862,6 @@ define i1 @test148(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_gt_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test148:
@@ -3043,8 +2870,7 @@ define i1 @test148(double %arg1, double %arg2, double %arg3) {
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -3062,15 +2888,13 @@ define i1 @test149(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test149:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -3089,7 +2913,6 @@ define i1 @test150(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_nle_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test150:
@@ -3098,8 +2921,7 @@ define i1 @test150(double %arg1, double %arg2, double %arg3) {
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -3117,15 +2939,13 @@ define i1 @test151(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_nlt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test151:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -3143,15 +2963,13 @@ define i1 @test152(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_ngt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test152:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -3170,7 +2988,6 @@ define i1 @test153(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_nge_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test153:
@@ -3179,8 +2996,7 @@ define i1 @test153(double %arg1, double %arg2, double %arg3) {
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll
index eecc91239c728..279819165f33c 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll
@@ -5,20 +5,19 @@ define void @wombat(i1 %cond, ptr addrspace(5) %addr) {
 ; CHECK-LABEL: wombat:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
-; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT:    buffer_load_ubyte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %then
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:  .LBB0_2: ; %end
-; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
+
 entry:
   %load = load <1 x i8>, ptr addrspace(5) %addr, align 1
   br i1 %cond, label %then, label %end
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index c3a6cd5975a77..53448df79ee27 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -34,19 +34,17 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x
 define i1 @divergent_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x, i1 %z) {
   ; GCN-LABEL: name: divergent_trunc_i16_to_i1
   ; GCN: bb.0 (%ir-block.0):
-  ; GCN-NEXT:   liveins: $vgpr2, $vgpr3
+  ; GCN-NEXT:   liveins: $vgpr2, $sgpr4_sgpr5
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GCN-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY]], implicit $exec
-  ; GCN-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec
   ; GCN-NEXT:   [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[COPY1]], 0, 16, implicit $exec
   ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GCN-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[V_BFE_I32_e64_]], killed [[S_MOV_B32_]], implicit $exec
-  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
-  ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
-  ; GCN-NEXT:   $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
-  ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], [[COPY]], implicit-def dead $scc
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vreg_1 = COPY [[S_OR_B64_]]
+  ; GCN-NEXT:   $sgpr0_sgpr1 = COPY [[COPY2]]
+  ; GCN-NEXT:   SI_RETURN implicit $sgpr0_sgpr1
   %setcc = icmp slt i16 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
   ret i1 %select
@@ -86,18 +84,16 @@ define amdgpu_kernel void @uniform_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x
 define i1 @divergent_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x, i1 %z) {
   ; GCN-LABEL: name: divergent_trunc_i32_to_i1
   ; GCN: bb.0 (%ir-block.0):
-  ; GCN-NEXT:   liveins: $vgpr2, $vgpr3
+  ; GCN-NEXT:   liveins: $vgpr2, $sgpr4_sgpr5
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GCN-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY]], implicit $exec
-  ; GCN-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec
   ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GCN-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY1]], killed [[S_MOV_B32_]], implicit $exec
-  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
-  ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
-  ; GCN-NEXT:   $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
-  ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], [[COPY]], implicit-def dead $scc
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vreg_1 = COPY [[S_OR_B64_]]
+  ; GCN-NEXT:   $sgpr0_sgpr1 = COPY [[COPY2]]
+  ; GCN-NEXT:   SI_RETURN implicit $sgpr0_sgpr1
   %setcc = icmp slt i32 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
   ret i1 %select
@@ -141,21 +137,19 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x
 define i1 @divergent_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x, i1 %z) {
   ; GCN-LABEL: name: divergent_trunc_i64_to_i1
   ; GCN: bb.0 (%ir-block.0):
-  ; GCN-NEXT:   liveins: $vgpr2, $vgpr3, $vgpr4
+  ; GCN-NEXT:   liveins: $vgpr2, $vgpr3, $sgpr4_sgpr5
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GCN-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GCN-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY]], implicit $exec
-  ; GCN-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec
   ; GCN-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
   ; GCN-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
   ; GCN-NEXT:   [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE]], [[COPY3]], implicit $exec
-  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
-  ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
-  ; GCN-NEXT:   $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
-  ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], [[COPY]], implicit-def dead $scc
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vreg_1 = COPY [[S_OR_B64_]]
+  ; GCN-NEXT:   $sgpr0_sgpr1 = COPY [[COPY2]]
+  ; GCN-NEXT:   SI_RETURN implicit $sgpr0_sgpr1
   %setcc = icmp slt i64 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
   ret i1 %select
diff --git a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
index 72ee660dc2adb..02a3066822e51 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
@@ -29,6 +29,8 @@ define i1 @extractloadi1(ptr %ptr, i32 %idx) {
 ; CHECK-NEXT:    buffer_store_byte v2, off, s[0:3], s32 offset:1
 ; CHECK-NEXT:    buffer_load_ubyte v0, v1, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %val = load <8 x i1>, ptr %ptr
   %ret = extractelement <8 x i1> %val, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index b5440b9c38c9f..fdf060ce5c24e 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -2835,10 +2835,8 @@ define float @v_fneg_select_infloop_regression_f32(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 0.0, float %arg
   %i2 = fneg float %i
@@ -2850,10 +2848,8 @@ define float @v_fneg_select_infloop_regression_f32_commute0(float %arg, i1 %arg1
 ; GCN-LABEL: v_fneg_select_infloop_regression_f32_commute0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float 0.0
   %i2 = fneg float %i
@@ -2865,10 +2861,8 @@ define float @v_fneg_select_infloop_regression_f32_commute1(float %arg, i1 %arg1
 ; GCN-LABEL: v_fneg_select_infloop_regression_f32_commute1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -v0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 0.0, float %arg
   %i2 = fneg float %i
@@ -2880,10 +2874,8 @@ define float @v_fneg_select_infloop_regression_f32_commute2(float %arg, i1 %arg1
 ; GCN-LABEL: v_fneg_select_infloop_regression_f32_commute2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -v0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float 0.0
   %i2 = fneg float %i
@@ -2896,10 +2888,8 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32(float %arg, i1 %ar
 ; GCN-LABEL: v_fneg_select_infloop_regression_inline_imm_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 2.0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 2.0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 2.0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 2.0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 2.0, float %arg
   %i2 = fneg float %i
@@ -2911,10 +2901,8 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32_commute0(float %ar
 ; GCN-LABEL: v_fneg_select_infloop_regression_inline_imm_f32_commute0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 2.0, v0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 2.0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 2.0, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 2.0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float 2.0
   %i2 = fneg float %i
@@ -2926,10 +2914,8 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32_commute1(float %ar
 ; GCN-LABEL: v_fneg_select_infloop_regression_inline_imm_f32_commute1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 2.0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 2.0, -v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 2.0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 2.0, -v0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 2.0, float %arg
   %i2 = fneg float %i
@@ -2941,10 +2927,8 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32_commute2(float %ar
 ; GCN-LABEL: v_fneg_select_infloop_regression_inline_imm_f32_commute2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 2.0, v0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 2.0, -v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 2.0, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 2.0, -v0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float 2.0
   %i2 = fneg float %i
@@ -2957,10 +2941,8 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32(float %arg, i1
 ; GCN-LABEL: v_fneg_select_infloop_regression_neg_inline_imm_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -2.0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, -2.0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -2.0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, -2.0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float -2.0, float %arg
   %i2 = fneg float %i
@@ -2972,10 +2954,8 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute0(float
 ; GCN-LABEL: v_fneg_select_infloop_regression_neg_inline_imm_f32_commute0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, -2.0, v0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, -2.0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -2.0, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, -2.0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float -2.0
   %i2 = fneg float %i
@@ -2987,10 +2967,8 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute1(float
 ; GCN-LABEL: v_fneg_select_infloop_regression_neg_inline_imm_f32_commute1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -2.0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -2.0, -v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -2.0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -2.0, -v0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float -2.0, float %arg
   %i2 = fneg float %i
@@ -3002,10 +2980,8 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2(float
 ; GCN-LABEL: v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, -2.0, v0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -2.0, -v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -2.0, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -2.0, -v0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float -2.0
   %i2 = fneg float %i
@@ -3064,12 +3040,10 @@ define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
-; GCN-NEXT:    v_bfrev_b32_e32 v3, 1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v1, -v1, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GCN-NEXT:    v_bfrev_b32_e32 v2, 1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v1, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, double 0.0, double %arg
   %i2 = fneg double %i
@@ -3121,21 +3095,17 @@ define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_and_b32_e32 v1, 1, v1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT:    v_cndmask_b32_e64 v0, -v0, 0, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v0, -v0, 0, s[4:5]
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_select_infloop_regression_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v1, 1, v1
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, half 0.0, half %arg
   %i2 = fneg half %i
@@ -3188,11 +3158,9 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_and_b32_e32 v1, 1, v2
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
 ; SI-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -3201,11 +3169,9 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a
 ; VI-LABEL: v_fneg_select_infloop_regression_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v1, 1, v1
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, <2 x half> zeroinitializer, <2 x half> %arg
   %i2 = fneg <2 x half> %i
@@ -3262,13 +3228,11 @@ define <2 x float> @v_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1
 ; GCN-LABEL: v_fneg_select_infloop_regression_v2f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
-; GCN-NEXT:    v_bfrev_b32_e32 v3, 1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v1, -v1, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GCN-NEXT:    v_bfrev_b32_e32 v2, 1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v1, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, <2 x float> zeroinitializer, <2 x float> %arg
   %i2 = fneg <2 x float> %i
@@ -3315,10 +3279,8 @@ define float @v_fabs_select_infloop_regression_f32(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fabs_select_infloop_regression_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, |v0|, 0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, |v0|, 0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 0.0, float %arg
   %i2 = call float @llvm.fabs.f32(float %i)
@@ -3366,10 +3328,8 @@ define float @v_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_fabs_select_infloop_regression:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -|v0|, 0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -|v0|, 0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 0.0, float %arg
   %i2 = call float @llvm.fabs.f32(float %i)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index cd1ec85eb8d0f..3680c416cd43f 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -7,18 +7,13 @@ define i32 @fneg_xor_select_i32(i1 %cond, i32 %arg0, i32 %arg1) {
 ; GCN-LABEL: fneg_xor_select_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v2, -v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v1, -v0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v2, -v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v1, -v0, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 %arg0, i32 %arg1
   %fneg = xor i32 %select, -2147483648
@@ -57,10 +52,8 @@ define i32 @fneg_xor_select_i32_multi_use(i1 %cond, i32 %arg0, i32 %arg1, ptr ad
 ; GFX7-LABEL: fneg_xor_select_i32_multi_use:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT:    flat_store_dword v[3:4], v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; GFX7-NEXT:    flat_store_dword v[2:3], v0
 ; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -68,10 +61,8 @@ define i32 @fneg_xor_select_i32_multi_use(i1 %cond, i32 %arg0, i32 %arg1, ptr ad
 ; GFX9-LABEL: fneg_xor_select_i32_multi_use:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    global_store_dword v[3:4], v0, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -79,12 +70,10 @@ define i32 @fneg_xor_select_i32_multi_use(i1 %cond, i32 %arg0, i32 %arg1, ptr ad
 ; GFX11-LABEL: fneg_xor_select_i32_multi_use:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
-; GFX11-NEXT:    global_store_b32 v[3:4], v1, off
+; GFX11-NEXT:    global_store_b32 v[2:3], v1, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 %arg0, i32 %arg1
   store i32 %select, ptr addrspace(1) %ptr
@@ -96,20 +85,15 @@ define i64 @fneg_xor_select_i64(i1 %cond, i64 %arg0, i64 %arg1) {
 ; GCN-LABEL: fneg_xor_select_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v3, -v1, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v3, -v1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i64 %arg0, i64 %arg1
   %fneg = xor i64 %select, 9223372036854775808
@@ -152,19 +136,15 @@ define i16 @fneg_xor_select_i16(i1 %cond, i16 %arg0, i16 %arg1) {
 ; GCN-LABEL: fneg_xor_select_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i16 %arg0, i16 %arg1
@@ -231,10 +211,8 @@ define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr ad
 ; GFX7-LABEL: fneg_xor_select_i16_multi_use:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT:    flat_store_short v[3:4], v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; GFX7-NEXT:    flat_store_short v[2:3], v0
 ; GFX7-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -242,10 +220,8 @@ define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr ad
 ; GFX9-LABEL: fneg_xor_select_i16_multi_use:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    global_store_short v[3:4], v0, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_short v[2:3], v0, off
 ; GFX9-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -253,12 +229,10 @@ define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr ad
 ; GFX11-LABEL: fneg_xor_select_i16_multi_use:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v1
-; GFX11-NEXT:    global_store_b16 v[3:4], v1, off
+; GFX11-NEXT:    global_store_b16 v[2:3], v1, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i16 %arg0, i16 %arg1
   store i16 %select, ptr addrspace(1) %ptr
@@ -270,38 +244,34 @@ define i64 @fneg_xor_select_i64_multi_user(i1 %cond, i64 %arg0, i64 %arg1, ptr a
 ; GFX7-LABEL: fneg_xor_select_i64_multi_user:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v7, v1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[5:6], v[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v6, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v3, v6, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX7-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, -v3, -v6, s[4:5]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fneg_xor_select_i64_multi_user:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
-; GFX9-NEXT:    global_store_dwordx2 v[5:6], v[0:1], off
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, -v3, -v6, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i64_multi_user:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v4, v2 :: v_dual_cndmask_b32 v0, v3, v7
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, -v4, -v2, vcc_lo
-; GFX11-NEXT:    global_store_b64 v[5:6], v[0:1], off
+; GFX11-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v3, v6, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, -v3, -v6, s0
+; GFX11-NEXT:    global_store_b64 v[4:5], v[0:1], off
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i64 %arg0, i64 %arg1
@@ -314,30 +284,21 @@ define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg
 ; GCN-LABEL: select_fneg_xor_select_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_xor_select_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor i32 %arg0, -2147483648
   %select0 = select i1 %cond0, i32 %arg1, i32 %fneg0
@@ -350,25 +311,16 @@ define float @select_fneg_select_f32(i1 %cond0, i1 %cond1, float %arg0, float %a
 ; GCN-LABEL: select_fneg_select_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v2, v3, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, s[6:7]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_select_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v2, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = fneg float %arg0
   %select0 = select i1 %cond0, float %arg1, float %fneg0
@@ -381,20 +333,15 @@ define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) {
 ; GCN-LABEL: fneg_xor_select_f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v3, -v1, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v3, -v1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, double %arg0, double %arg1
   %fneg = fneg double %select
@@ -405,12 +352,9 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
 ; GFX7-LABEL: fneg_xor_select_f64_multi_user:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v7, v1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[5:6], v[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX7-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
 ; GFX7-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -418,12 +362,9 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
 ; GFX9-LABEL: fneg_xor_select_f64_multi_user:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
-; GFX9-NEXT:    global_store_dwordx2 v[5:6], v[0:1], off
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -431,13 +372,11 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
 ; GFX11-LABEL: fneg_xor_select_f64_multi_user:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v4, v2 :: v_dual_cndmask_b32 v0, v3, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
-; GFX11-NEXT:    global_store_b64 v[5:6], v[0:1], off
+; GFX11-NEXT:    global_store_b64 v[4:5], v[0:1], off
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, double %arg0, double %arg1
@@ -450,21 +389,18 @@ define double @fneg_xor_select_i64_user_with_srcmods(i1 %cond, i64 %arg0, i64 %a
 ; GCN-LABEL: fneg_xor_select_i64_user_with_srcmods:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GCN-NEXT:    v_add_f64 v[0:1], -v[1:2], 2.0
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GCN-NEXT:    v_add_f64 v[0:1], -v[0:1], 2.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i64_user_with_srcmods:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v2, v4, v2
-; GFX11-NEXT:    v_add_f64 v[0:1], -v[1:2], 2.0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f64 v[0:1], -v[0:1], 2.0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i64 %arg0, i64 %arg1
   %fneg = xor i64 %select, 9223372036854775808
@@ -477,32 +413,23 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
 ; GCN-LABEL: select_fneg_select_fneg_f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[6:7]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_select_fneg_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
+; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = fneg double %arg0
   %select0 = select i1 %cond0, double %arg1, double %fneg0
@@ -515,32 +442,23 @@ define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg
 ; GCN-LABEL: select_fneg_xor_select_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[6:7]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_xor_select_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
+; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor i64 %arg0, 9223372036854775808
   %select0 = select i1 %cond0, i64 %arg1, i64 %fneg0
@@ -553,45 +471,32 @@ define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1
 ; GFX7-LABEL: select_fneg_select_f16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e64 v2, -v2
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, vcc
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e64 v0, -v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, s[6:7]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: select_fneg_select_f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_select_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = fneg half %arg0
   %select0 = select i1 %cond0, half %arg1, half %fneg0
@@ -604,30 +509,21 @@ define i16 @select_fneg_xor_select_i16(i1 %cond0, i1 %cond1, i16 %arg0, i16 %arg
 ; GCN-LABEL: select_fneg_xor_select_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GCN-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_xor_select_i16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v0
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor i16 %arg0, -32768
   %select0 = select i1 %cond0, i16 %arg1, i16 %fneg0

>From ad7d65712f860ddee8413fa75b11f820b02681d3 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 19 Feb 2024 18:20:19 -0600
Subject: [PATCH 10/25] Updated calling conv such that inreg i1 is promoted to
 i32 before being allocated.

---
 llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td   |   9 +-
 .../CodeGen/AMDGPU/function-args-inreg.ll     |   3 -
 llvm/test/CodeGen/AMDGPU/function-args.ll     | 192 +++++++++---------
 3 files changed, 103 insertions(+), 101 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 0a197e4a786cc..8dd1daa642f9f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -187,16 +187,17 @@ def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
 // Calling convention for leaf functions
 def CC_AMDGPU_Func : CallingConv<[
   CCIfByVal<CCPassByVal<4, 4>>,
+  CCIfType<[i1], CCIfInReg<CCPromoteToType<i32>>>,
   CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
 
-  CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
-
-  CCIfType<[i1], CCPromoteToType<i32>>,
-
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
     !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
   >>>,
 
+  CCIfType<[i1], CCCustom<"CC_AMDGPU_Custom_I1">>,
+
+  CCIfType<[i1], CCPromoteToType<i32>>,
+
   CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1, bf16, v2bf16], CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 44a9127b4bd09..9871b89431cd0 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -1793,9 +1793,6 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
 ; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s7, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s2, s6
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 9fca84ef2667c..530e439ae572a 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -2778,11 +2778,16 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:12
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
-; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:8
+; CI-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; CI-NEXT:    s_waitcnt vmcnt(2)
+; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2791,13 +2796,9 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v18, v20
-; CI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; CI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v19, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v17, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_short v18, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -2824,8 +2825,9 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:16
 ; VI-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:4
-; VI-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2834,7 +2836,7 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v18, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v19, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_byte v20, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -2842,7 +2844,7 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_short v17, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v19, off, s[4:7], 0
+; VI-NEXT:    buffer_store_short v18, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2863,8 +2865,9 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:16
 ; GFX9-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2873,7 +2876,7 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v18, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v19, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_byte v20, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -2881,22 +2884,23 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_short v17, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v19, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_short v18, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_i1_i8_i16_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    s_clause 0x4
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    scratch_load_u16 v33, off, s32 offset:4
 ; GFX11-NEXT:    scratch_load_u16 v34, off, s32 offset:8
 ; GFX11-NEXT:    scratch_load_u16 v35, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v36, off, s32 offset:16
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v32, 0, 1, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
@@ -4892,51 +4896,51 @@ define void @many_i1_args(
 ; GFX9:      ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 vcc, -1
-; GFX9-NEXT:    buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, vcc
-; GFX9-NEXT:    v_writelane_b32 v19, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v19, s31, 1
-; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v20, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v20, s31, 1
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s31, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s30, -1
-; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[6:7]
-; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[6:7]
+; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[8:9]
-; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[8:9]
+; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[10:11]
-; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[10:11]
+; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[12:13]
-; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[12:13]
+; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[14:15]
-; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[14:15]
+; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[16:17]
-; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[16:17]
+; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[18:19]
-; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[18:19]
+; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[20:21]
-; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[20:21]
+; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[22:23]
-; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[22:23]
+; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[24:25]
-; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[24:25]
+; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[26:27]
-; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[26:27]
+; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[28:29]
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[28:29]
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -4994,10 +4998,10 @@ define void @many_i1_args(
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v18
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readlane_b32 s31, v19, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v19, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v20, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -5006,23 +5010,25 @@ define void @many_i1_args(
 ; GFX11:      ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
-; GFX11-NEXT:    scratch_store_b32 off, v2, s32          ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v7, s32          ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, vcc_lo
-; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s1
+; GFX11-NEXT:    v_writelane_b32 v7, s30, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s1
 ; GFX11-NEXT:    s_mov_b32 s30, -1
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
-; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX11-NEXT:    v_writelane_b32 v7, s31, 1
 ; GFX11-NEXT:    s_mov_b32 s31, 0x31016000
+; GFX11-NEXT:    buffer_store_b8 v2, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s6
+; GFX11-NEXT:    buffer_store_b8 v2, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s3
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s5
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s6
 ; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
@@ -5031,13 +5037,13 @@ define void @many_i1_args(
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s11
+; GFX11-NEXT:    buffer_store_b8 v2, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s7
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s8
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s9
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s10
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s11
 ; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
@@ -5046,13 +5052,13 @@ define void @many_i1_args(
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s13
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s15
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s16
+; GFX11-NEXT:    buffer_store_b8 v2, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s12
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s13
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s14
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s15
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s16
 ; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
@@ -5061,13 +5067,13 @@ define void @many_i1_args(
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s17
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s19
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s20
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s21
+; GFX11-NEXT:    buffer_store_b8 v2, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s17
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s18
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s19
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s20
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s21
 ; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
@@ -5076,13 +5082,13 @@ define void @many_i1_args(
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s22
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s23
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s24
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s25
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s26
+; GFX11-NEXT:    buffer_store_b8 v2, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s22
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s23
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s24
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s25
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s26
 ; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
@@ -5091,27 +5097,25 @@ define void @many_i1_args(
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s27
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s28
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s29
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s27
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s28
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s29
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT:    buffer_store_b8 v2, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b8 v0, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b8 v1, off, s[28:31], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v7, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v7, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v2, off, s32           ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v7, off, s32           ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]

>From d841a49e6b9d06c9b91bdae02e42e6561329facb Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Sun, 10 Mar 2024 19:07:38 -0500
Subject: [PATCH 11/25] Add an additional CopyToReg and CopyFromReg for the
 i1 return value.

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  | 33 +++++++++++++++++++++-
 llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp |  8 ------
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0fdb3c4e36c67..a40fec97d6b91 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3239,6 +3239,21 @@ SDValue SITargetLowering::LowerCallResult(
       Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
       Chain = Val.getValue(1);
       InGlue = Val.getValue(2);
+
+      // For i1 return value allocated to an SGPR, we want the dst reg for the
+      // above CopyFromReg not to be of VReg_1 when emitting machine code.
+      // This requires creating an additional CopyToReg followed by another
+      // CopyFromReg.
+      if (RVLocs.size() == 1 && VA.getLocVT() == MVT::i1) {
+        const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+        MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+
+        if (TRI->isSGPRReg(MRI, VA.getLocReg())) {
+          Register TmpVReg = MRI.createVirtualRegister(TRI->getBoolRC());
+          SDValue TmpCopyTo = DAG.getCopyToReg(Chain, DL, TmpVReg, Val);
+          Val = DAG.getCopyFromReg(TmpCopyTo, DL, TmpVReg, MVT::i1);
+        }
+      }
     } else if (VA.isMemLoc()) {
       report_fatal_error("TODO: return values in memory");
     } else
@@ -15995,6 +16010,21 @@ static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
   return false;
 }
 
+LLVM_ATTRIBUTE_UNUSED
+static bool isCopyFromRegForI1Return(const SDNode *N) {
+  assert(N->getOpcode() == ISD::CopyFromReg);
+  SDNode *N1 = N->getOperand(0).getNode();
+  if (N1->getOpcode() != ISD::CopyToReg)
+    return false;
+  SDNode *N2 = N1->getOperand(0).getNode();
+  if (N2->getOpcode() != ISD::CopyFromReg)
+    return false;
+  SDNode *N3 = N2->getOperand(0).getNode();
+  if (N3->getOpcode() != ISD::CALLSEQ_END)
+    return false;
+  return true;
+}
+
 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
                                                   FunctionLoweringInfo *FLI,
                                                   UniformityInfo *UA) const {
@@ -16012,7 +16042,8 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
     if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
       return UA->isDivergent(V);
 
-    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
+    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N) ||
+           isCopyFromRegForI1Return(N));
     return !TRI->isSGPRReg(MRI, Reg);
   }
   case ISD::LOAD: {
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index a04ce16cbddb6..32dad0c425c04 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -689,14 +689,6 @@ bool Vreg1LoweringHelper::lowerCopiesToI1() {
       assert(!MI.getOperand(1).getSubReg());
 
       if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
-        if (!SrcReg.isVirtual() &&
-            TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
-          // When calling convention allocates SGPR for i1, for GPUs with
-          // wavefront size 64, i1 return value is put in 64b SGPR.
-          assert(ST->isWave64());
-          continue;
-        }
-
         assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
         Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)

>From df1bbe3c23080e39486bc0ad6be2a71092877d02 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Sun, 10 Mar 2024 19:18:55 -0500
Subject: [PATCH 12/25] Revert a formatting change made by clang-format.

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 3db884b78e007..08351c49b2231 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -866,7 +866,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
+            .addReg(SrcReg, getKillRegState(KillSrc));
     return;
   }
 
@@ -881,13 +881,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     if (DestReg == AMDGPU::VCC) {
       if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
-            .addReg(SrcReg, getKillRegState(KillSrc));
+          .addReg(SrcReg, getKillRegState(KillSrc));
       } else {
         // FIXME: Hack until VReg_1 removed.
         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
-            .addImm(0)
-            .addReg(SrcReg, getKillRegState(KillSrc));
+          .addImm(0)
+          .addReg(SrcReg, getKillRegState(KillSrc));
       }
 
       return;

>From 4c098fed1b45576c60bed18ca1cdc6697f452a07 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 21 Mar 2024 18:51:58 -0500
Subject: [PATCH 13/25] This commit: (1) fixed i1 array as func return (2)
 fixed i1 return when GlobalISel is used (3) zeroext/signext in i1 return is
 ignored (4) inreg return of i1 is treated as i32 (5) new test files.

---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |  12 +-
 llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td   |   2 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   3 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  19 +-
 .../GlobalISel/function-call-i1-return.ll     | 294 +++++++
 .../AMDGPU/GlobalISel/function-i1-args.ll     | 569 ++++++++++++
 .../AMDGPU/GlobalISel/function-returns.ll     |  22 +-
 .../GlobalISel/irtranslator-function-args.ll  | 209 +----
 llvm/test/CodeGen/AMDGPU/function-args.ll     | 329 ++-----
 .../CodeGen/AMDGPU/function-call-i1-return.ll | 198 +++++
 llvm/test/CodeGen/AMDGPU/function-i1-args.ll  | 819 ++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/function-returns.ll  |  12 +-
 12 files changed, 1996 insertions(+), 492 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/function-i1-args.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 53dbae7765803..2d25827906f15 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -73,7 +73,7 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
     if (TRI->isSGPRReg(MRI, PhysReg)) {
       LLT Ty = MRI.getType(ExtReg);
       LLT S32 = LLT::scalar(32);
-      if (Ty != S32) {
+      if (Ty != S32 && Ty != LLT::scalar(64)) {
         // FIXME: We should probably support readfirstlane intrinsics with all
         // legal 32-bit types.
         assert(Ty.getSizeInBits() == 32);
@@ -88,6 +88,9 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
                                         {MRI.getType(ExtReg)})
                         .addReg(ExtReg);
       ExtReg = ToSGPR.getReg(0);
+      if (VA.getLocVT() == MVT::i1 &&
+          MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
+        ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(64), ExtReg).getReg(0);
     }
 
     MIRBuilder.buildCopy(PhysReg, ExtReg);
@@ -127,10 +130,9 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
       unsigned CopyToBits = 32;
 
       // When function return type is i1, it may be in a 64b register.
-      if (VA.getLocVT() == MVT::i1) {
-        if (MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
-          CopyToBits = 64;
-      }
+      if (VA.getLocVT() == MVT::i1 &&
+          MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
+        CopyToBits = 64;
 
       auto Copy = MIRBuilder.buildCopy(LLT::scalar(CopyToBits), PhysReg);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 8dd1daa642f9f..2c356731bf995 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -209,6 +209,8 @@ def CC_AMDGPU_Func : CallingConv<[
 // Calling convention for leaf functions
 def RetCC_AMDGPU_Func : CallingConv<[
   CCIfType<[i16], CCIfExtend<CCPromoteToType<i32>>>,
+  CCIfType<[i1], CCIfInReg<CCPromoteToType<i32>>>,
+
   CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
 
   CCIfType<[i32, f32, i16, f16, v2i16, v2f16, bf16, v2bf16], CCAssignToReg<[
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 02cb248836df1..e117395eb699a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -816,6 +816,9 @@ EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                               ISD::NodeType ExtendKind) const {
   assert(!VT.isVector() && "only scalar expected");
 
+  if (VT == MVT::i1)
+    return MVT::i1;
+
   // Round to the next multiple of 32-bits.
   unsigned Size = VT.getSizeInBits();
   if (Size <= 32)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a40fec97d6b91..618fdd95f4a4b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3240,11 +3240,13 @@ SDValue SITargetLowering::LowerCallResult(
       Chain = Val.getValue(1);
       InGlue = Val.getValue(2);
 
-      // For i1 return value allocated to an SGPR, we want the dst reg for the
-      // above CopyFromReg not to be of VReg_1 when emitting machine code.
-      // This requires creating an addional CopyToReg followed by another
+      // For i1 return value allocated to an SGPR, the following is a
+      // workaround before SILowerI1Copies is fixed. Basically we want the
+      // dst reg for the above CopyFromReg not to be of the VReg_1 class
+      // when emitting machine code. This workaround creates an additional
+      // CopyToReg with a new virtual register, followed by another
       // CopyFromReg.
-      if (RVLocs.size() == 1 && VA.getLocVT() == MVT::i1) {
+      if (VA.getLocVT() == MVT::i1) {
         const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
         MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
 
@@ -16019,7 +16021,14 @@ static bool isCopyFromRegForI1Return(const SDNode *N) {
   SDNode *N2 = N1->getOperand(0).getNode();
   if (N2->getOpcode() != ISD::CopyFromReg)
     return false;
-  SDNode *N3 = N2->getOperand(0).getNode();
+
+  // Possibly multiple CopyFromReg nodes before getting to CALLSEQ_END,
+  // e.g., when the return value is an array.
+  SDNode *N3 = N2;
+  do {
+    N3 = N3->getOperand(0).getNode();
+  } while (N3->getOpcode() == ISD::CopyFromReg);
+
   if (N3->getOpcode() != ISD::CALLSEQ_END)
     return false;
   return true;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
new file mode 100644
index 0000000000000..24a51a9904d25
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
@@ -0,0 +1,294 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX9 -enable-var-scope %s
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 -enable-var-scope %s
+
+define i1 @i1_func_void() {
+; GFX9-LABEL: name: i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX9-NEXT:    [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+; GFX9-NEXT:    [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
+; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
+; GFX9-NEXT:    SI_RETURN implicit $sgpr0_sgpr1
+;
+; GFX11-LABEL: name: i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT:    [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+; GFX11-NEXT:    $sgpr0 = COPY [[INTRIN]](s32)
+; GFX11-NEXT:    SI_RETURN implicit $sgpr0
+  %val = load i1, ptr addrspace(1) undef
+  ret i1 %val
+}
+
+define void @test_call_i1_func_void() {
+; GFX9-LABEL: name: test_call_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @i1_func_void
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @i1_func_void
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @i1_func_void, csr_amdgpu, implicit-def $sgpr0
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  %val = call i1 @i1_func_void()
+  store volatile i1 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define zeroext i1 @zeroext_i1_func_void() {
+; GFX9-LABEL: name: zeroext_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX9-NEXT:    [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+; GFX9-NEXT:    [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
+; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
+; GFX9-NEXT:    SI_RETURN implicit $sgpr0_sgpr1
+;
+; GFX11-LABEL: name: zeroext_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT:    [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+; GFX11-NEXT:    $sgpr0 = COPY [[INTRIN]](s32)
+; GFX11-NEXT:    SI_RETURN implicit $sgpr0
+  %val = load i1, ptr addrspace(1) undef
+  ret i1 %val
+}
+
+define void @test_call_zeroext_i1_func_void() {
+; GFX9-LABEL: name: test_call_zeroext_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @zeroext_i1_func_void
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @zeroext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_zeroext_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @zeroext_i1_func_void
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @zeroext_i1_func_void, csr_amdgpu, implicit-def $sgpr0
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  %val = call i1 @zeroext_i1_func_void()
+  store volatile i1 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define signext i1 @signext_i1_func_void() {
+; GFX9-LABEL: name: signext_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX9-NEXT:    [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+; GFX9-NEXT:    [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
+; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
+; GFX9-NEXT:    SI_RETURN implicit $sgpr0_sgpr1
+;
+; GFX11-LABEL: name: signext_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT:    [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+; GFX11-NEXT:    $sgpr0 = COPY [[INTRIN]](s32)
+; GFX11-NEXT:    SI_RETURN implicit $sgpr0
+  %val = load i1, ptr addrspace(1) undef
+  ret i1 %val
+}
+
+define void @test_call_signext_i1_func_void() {
+; GFX9-LABEL: name: test_call_signext_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @signext_i1_func_void
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @signext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_signext_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @signext_i1_func_void
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @signext_i1_func_void, csr_amdgpu, implicit-def $sgpr0
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  %val = call i1 @signext_i1_func_void()
+  store volatile i1 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define inreg i1 @inreg_i1_func_void() {
+; GFX9-LABEL: name: inreg_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX9-NEXT:    $vgpr0 = COPY [[ANYEXT]](s32)
+; GFX9-NEXT:    SI_RETURN implicit $vgpr0
+;
+; GFX11-LABEL: name: inreg_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT:    $vgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT:    SI_RETURN implicit $vgpr0
+  %val = load i1, ptr addrspace(1) undef
+  ret i1 %val
+}
+
+define void @test_call_inreg_i1_func_void() {
+; GFX9-LABEL: name: test_call_inreg_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @inreg_i1_func_void
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @inreg_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_inreg_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @inreg_i1_func_void
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @inreg_i1_func_void, csr_amdgpu, implicit-def $vgpr0
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  %val = call i1 @inreg_i1_func_void()
+  store volatile i1 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define [2 x i1] @a2i1_func_void() {
+; GFX9-LABEL: name: a2i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; GFX9-NEXT:    [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
+; GFX9-NEXT:    [[LOAD2:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD]](p1) :: (load (s1) from `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX9-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX9-NEXT:    [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+; GFX9-NEXT:    [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
+; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
+; GFX9-NEXT:    [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD2]](s1)
+; GFX9-NEXT:    [[INTRIN2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT3]](s32)
+; GFX9-NEXT:    [[ANYEXT4:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN2]](s32)
+; GFX9-NEXT:    $sgpr2_sgpr3 = COPY [[ANYEXT4]](s64)
+; GFX9-NEXT:    SI_RETURN implicit $sgpr0_sgpr1, implicit $sgpr2_sgpr3
+;
+; GFX11-LABEL: name: a2i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; GFX11-NEXT:    [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
+; GFX11-NEXT:    [[LOAD2:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD]](p1) :: (load (s1) from `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT:    [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+; GFX11-NEXT:    $sgpr0 = COPY [[INTRIN]](s32)
+; GFX11-NEXT:    [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD2]](s1)
+; GFX11-NEXT:    [[INTRIN2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT3]](s32)
+; GFX11-NEXT:    $sgpr1 = COPY [[INTRIN2]](s32)
+; GFX11-NEXT:    SI_RETURN implicit $sgpr0, implicit $sgpr1
+  %val = load [2 x i1], ptr addrspace(1) undef
+  ret [2 x i1] %val
+}
+
+define void @test_call_a2i1_func_void() {
+; GFX9-LABEL: name: test_call_a2i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a2i1_func_void
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @a2i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1, implicit-def $sgpr2_sgpr3
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT:    [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr2_sgpr3
+; GFX9-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; GFX9-NEXT:    [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
+; GFX9-NEXT:    G_STORE [[TRUNC2]](s1), [[PTRADD]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_a2i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a2i1_func_void
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @a2i1_func_void, csr_amdgpu, implicit-def $sgpr0, implicit-def $sgpr1
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; GFX11-NEXT:    [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
+; GFX11-NEXT:    G_STORE [[TRUNC2]](s1), [[PTRADD]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  %val = call [2 x i1] @a2i1_func_void()
+  store volatile [2 x i1] %val, ptr addrspace(1) undef
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
new file mode 100644
index 0000000000000..f4c85df0e0a1b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
@@ -0,0 +1,569 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX9 -enable-var-scope %s
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 -enable-var-scope %s
+
+define void @void_func_i1(i1 %arg0) {
+; GFX9-LABEL: name: void_func_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+; GFX9-NEXT: {{  $}}
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: void_func_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:   liveins: $sgpr0
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  store i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @test_call_void_func_i1() {
+; GFX9-LABEL: name: test_call_void_func_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1
+; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1
+; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT:    $sgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1, csr_amdgpu, implicit $sgpr0
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    SI_RETURN
+  %val = load i1, ptr addrspace(1) undef
+  call void @void_func_i1(i1 %val)
+  ret void
+}
+
+define void @void_func_i1_zeroext(i1 zeroext %arg0) {
+; GFX9-LABEL: name: void_func_i1_zeroext
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    liveins: $sgpr4_sgpr5
+; GFX9-NEXT: {{  $}}
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
+; GFX9-NEXT:    [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ZEXT]], [[CONST]]
+; GFX9-NEXT:    G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: void_func_i1_zeroext
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    liveins: $sgpr0
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
+; GFX11-NEXT:    [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ZEXT]], [[CONST]]
+; GFX11-NEXT:    G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  %ext = zext i1 %arg0 to i32
+  %add = add i32 %ext, 12
+  store i32 %add, ptr addrspace(1) undef
+  ret void
+}
+
+define void @test_call_void_func_i1_zeroext() {
+; GFX9-LABEL: name: test_call_void_func_i1_zeroext
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_zeroext
+; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_zeroext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_i1_zeroext
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_zeroext
+; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT:    $sgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_zeroext, csr_amdgpu, implicit $sgpr0
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    SI_RETURN
+  %val = load i1, ptr addrspace(1) undef
+  call void @void_func_i1_zeroext(i1 %val)
+  ret void
+}
+
+define void @void_func_i1_signext(i1 signext %arg0) {
+; GFX9-LABEL: name: void_func_i1_signext
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    liveins: $sgpr4_sgpr5
+; GFX9-NEXT: {{  $}}
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
+; GFX9-NEXT:    [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT]], [[CONST]]
+; GFX9-NEXT:    G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: void_func_i1_signext
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    liveins: $sgpr0
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
+; GFX11-NEXT:    [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT]], [[CONST]]
+; GFX11-NEXT:    G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  %ext = sext i1 %arg0 to i32
+  %add = add i32 %ext, 12
+  store i32 %add, ptr addrspace(1) undef
+  ret void
+}
+
+define void @test_call_void_func_i1_signext() {
+; GFX9-LABEL: name: test_call_void_func_i1_signext
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_signext
+; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_signext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_i1_signext
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_signext
+; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT:    $sgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_signext, csr_amdgpu, implicit $sgpr0
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    SI_RETURN
+  %val = load i1, ptr addrspace(1) undef
+  call void @void_func_i1_signext(i1 %val)
+  ret void
+}
+
+define void @void_func_a2i1([2 x i1] %arg0) {
+; GFX9-LABEL: name: void_func_a2i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
+; GFX9-NEXT: {{  $}}
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
+; GFX9-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; GFX9-NEXT:    [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
+; GFX9-NEXT:    G_STORE [[TRUNC2]](s1), [[PTRADD]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: void_func_a2i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    liveins: $sgpr0, $sgpr1
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; GFX11-NEXT:    [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
+; GFX11-NEXT:    G_STORE [[TRUNC2]](s1), [[PTRADD]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  store [2 x i1] %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @test_call_void_func_a2i1() {
+; GFX9-LABEL: name: test_call_void_func_a2i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[CONST1:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+; GFX9-NEXT:    [[CONST2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_a2i1
+; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[CONST1]](s1)
+; GFX9-NEXT:    $sgpr2_sgpr3 = COPY [[CONST2]](s1)
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_a2i1, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_a2i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[CONST1:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+; GFX11-NEXT:    [[CONST2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_a2i1
+; GFX11-NEXT:    [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST1]](s1)
+; GFX11-NEXT:    $sgpr0 = COPY [[ANYEXT1]](s32)
+; GFX11-NEXT:    [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST2]](s1)
+; GFX11-NEXT:    $sgpr1 = COPY [[ANYEXT2]](s32)
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_a2i1, csr_amdgpu, implicit $sgpr0, implicit $sgpr1
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    SI_RETURN
+  %1 = insertvalue [2 x i1] undef, i1 0, 0
+  %2 = insertvalue [2 x i1] %1, i1 1, 1
+  call void @void_func_a2i1([2 x i1] %2)
+  ret void
+}
+
+define void @void_func_i1_i1(i1 %arg0, i1 %arg1) {
+; GFX9-LABEL: name: void_func_i1_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
+; GFX9-NEXT: {{  $}}
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
+; GFX9-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: void_func_i1_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    liveins: $sgpr0, $sgpr1
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  ret void
+}
+
+define void @test_call_void_func_i1_i1() {
+; GFX9-LABEL: name: test_call_void_func_i1_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_i1
+; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    $sgpr2_sgpr3 = COPY [[CONST]](s1)
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_i1, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_i1_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_i1
+; GFX11-NEXT:    [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT:    $sgpr0 = COPY [[ANYEXT1]](s32)
+; GFX11-NEXT:    [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
+; GFX11-NEXT:    $sgpr1 = COPY [[ANYEXT2]](s32)
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_i1, csr_amdgpu, implicit $sgpr0, implicit $sgpr1
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    SI_RETURN
+  %val = load i1, ptr addrspace(1) undef
+  call void @void_func_i1_i1(i1 %val, i1 true)
+  ret void
+}
+
+define void @many_i1_args(
+  i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
+  i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
+  i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
+  i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
+; GFX9-LABEL: name: many_i1_args
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
+; GFX9-NEXT: {{  $}}
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT:    [[COPY1:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
+; GFX9-NEXT:    [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s64)
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr8_sgpr9
+; GFX9-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT:    [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr10_sgpr11
+; GFX9-NEXT:    [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
+; GFX9-NEXT:    [[COPY4:%[0-9]+]]:_(s64) = COPY $sgpr12_sgpr13
+; GFX9-NEXT:    [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s64)
+; GFX9-NEXT:    [[COPY5:%[0-9]+]]:_(s64) = COPY $sgpr14_sgpr15
+; GFX9-NEXT:    [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s64)
+; GFX9-NEXT:    [[COPY6:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
+; GFX9-NEXT:    [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s64)
+; GFX9-NEXT:    [[COPY7:%[0-9]+]]:_(s64) = COPY $sgpr18_sgpr19
+; GFX9-NEXT:    [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s64)
+; GFX9-NEXT:    [[COPY8:%[0-9]+]]:_(s64) = COPY $sgpr20_sgpr21
+; GFX9-NEXT:    [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s64)
+; GFX9-NEXT:    [[COPY9:%[0-9]+]]:_(s64) = COPY $sgpr22_sgpr23
+; GFX9-NEXT:    [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s64)
+; GFX9-NEXT:    [[COPY10:%[0-9]+]]:_(s64) = COPY $sgpr24_sgpr25
+; GFX9-NEXT:    [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s64)
+; GFX9-NEXT:    [[COPY11:%[0-9]+]]:_(s64) = COPY $sgpr26_sgpr27
+; GFX9-NEXT:    [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s64)
+; GFX9-NEXT:    [[COPY12:%[0-9]+]]:_(s64) = COPY $sgpr28_sgpr29
+; GFX9-NEXT:    [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s64)
+; GFX9-NEXT:    [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX9-NEXT:    [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
+; GFX9-NEXT:    [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX9-NEXT:    [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
+; GFX9-NEXT:    [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr2
+; GFX9-NEXT:    [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
+; GFX9-NEXT:    [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr3
+; GFX9-NEXT:    [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
+; GFX9-NEXT:    [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr4
+; GFX9-NEXT:    [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
+; GFX9-NEXT:    [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr5
+; GFX9-NEXT:    [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
+; GFX9-NEXT:    [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr6
+; GFX9-NEXT:    [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+; GFX9-NEXT:    [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr7
+; GFX9-NEXT:    [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
+; GFX9-NEXT:    [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr8
+; GFX9-NEXT:    [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+; GFX9-NEXT:    [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr9
+; GFX9-NEXT:    [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
+; GFX9-NEXT:    [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr10
+; GFX9-NEXT:    [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
+; GFX9-NEXT:    [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr11
+; GFX9-NEXT:    [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
+; GFX9-NEXT:    [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr12
+; GFX9-NEXT:    [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
+; GFX9-NEXT:    [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr13
+; GFX9-NEXT:    [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
+; GFX9-NEXT:    [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr14
+; GFX9-NEXT:    [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
+; GFX9-NEXT:    [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr15
+; GFX9-NEXT:    [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
+; GFX9-NEXT:    [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr16
+; GFX9-NEXT:    [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; GFX9-NEXT:    [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr17
+; GFX9-NEXT:    [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; GFX9-NEXT:    [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr18
+; GFX9-NEXT:    [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+;
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs to TRUNC1-TRUNC30 omitted
+; GFX9:         G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+;
+; GFX11-LABEL: name: many_i1_args
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT:   [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
+; GFX11-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+; GFX11-NEXT:   [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr3
+; GFX11-NEXT:   [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s32)
+; GFX11-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX11-NEXT:   [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s32)
+; GFX11-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr5
+; GFX11-NEXT:   [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s32)
+; GFX11-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX11-NEXT:   [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s32)
+; GFX11-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr7
+; GFX11-NEXT:   [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
+; GFX11-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr8
+; GFX11-NEXT:   [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
+; GFX11-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr9
+; GFX11-NEXT:   [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
+; GFX11-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr10
+; GFX11-NEXT:   [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
+; GFX11-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr11
+; GFX11-NEXT:   [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
+; GFX11-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr12
+; GFX11-NEXT:   [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
+; GFX11-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY $sgpr13
+; GFX11-NEXT:   [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
+; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY $sgpr14
+; GFX11-NEXT:   [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
+; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $sgpr15
+; GFX11-NEXT:   [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
+; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $sgpr16
+; GFX11-NEXT:   [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
+; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $sgpr17
+; GFX11-NEXT:   [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
+; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $sgpr18
+; GFX11-NEXT:   [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
+; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $sgpr19
+; GFX11-NEXT:   [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $sgpr20
+; GFX11-NEXT:   [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
+; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $sgpr21
+; GFX11-NEXT:   [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $sgpr22
+; GFX11-NEXT:   [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
+; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $sgpr23
+; GFX11-NEXT:   [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
+; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $sgpr24
+; GFX11-NEXT:   [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
+; GFX11-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $sgpr25
+; GFX11-NEXT:   [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
+; GFX11-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $sgpr26
+; GFX11-NEXT:   [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
+; GFX11-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $sgpr27
+; GFX11-NEXT:   [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
+; GFX11-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $sgpr28
+; GFX11-NEXT:   [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
+; GFX11-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $sgpr29
+; GFX11-NEXT:   [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; GFX11-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT:   [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; GFX11-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX11-NEXT:   [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+;
+; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs to TRUNC1-TRUNC30 omitted
+; GFX11:        G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  store volatile i1 %arg2, ptr addrspace(1) undef
+  store volatile i1 %arg3, ptr addrspace(1) undef
+  store volatile i1 %arg4, ptr addrspace(1) undef
+  store volatile i1 %arg5, ptr addrspace(1) undef
+  store volatile i1 %arg6, ptr addrspace(1) undef
+  store volatile i1 %arg7, ptr addrspace(1) undef
+
+  store volatile i1 %arg8, ptr addrspace(1) undef
+  store volatile i1 %arg9, ptr addrspace(1) undef
+  store volatile i1 %arg10, ptr addrspace(1) undef
+  store volatile i1 %arg11, ptr addrspace(1) undef
+  store volatile i1 %arg12, ptr addrspace(1) undef
+  store volatile i1 %arg13, ptr addrspace(1) undef
+  store volatile i1 %arg14, ptr addrspace(1) undef
+  store volatile i1 %arg15, ptr addrspace(1) undef
+
+  store volatile i1 %arg16, ptr addrspace(1) undef
+  store volatile i1 %arg17, ptr addrspace(1) undef
+  store volatile i1 %arg18, ptr addrspace(1) undef
+  store volatile i1 %arg19, ptr addrspace(1) undef
+  store volatile i1 %arg20, ptr addrspace(1) undef
+  store volatile i1 %arg21, ptr addrspace(1) undef
+  store volatile i1 %arg22, ptr addrspace(1) undef
+  store volatile i1 %arg23, ptr addrspace(1) undef
+
+  store volatile i1 %arg24, ptr addrspace(1) undef
+  store volatile i1 %arg25, ptr addrspace(1) undef
+  store volatile i1 %arg26, ptr addrspace(1) undef
+  store volatile i1 %arg27, ptr addrspace(1) undef
+  store volatile i1 %arg28, ptr addrspace(1) undef
+  store volatile i1 %arg29, ptr addrspace(1) undef
+  store volatile i1 %arg30, ptr addrspace(1) undef
+  store volatile i1 %arg31, ptr addrspace(1) undef
+
+  ret void
+}
+
+define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
+; GFX9-LABEL: name: void_func_i1_i1_inreg
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    liveins: $sgpr6, $sgpr4_sgpr5
+; GFX9-NEXT: {{  $}}
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX9-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: void_func_i1_i1_inreg
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    liveins: $sgpr0, $sgpr1
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
+; GFX9-LABEL: name: void_func_i1_inreg_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    liveins: $sgpr4, $sgpr6_sgpr7
+; GFX9-NEXT: {{  $}}
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
+; GFX9-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: void_func_i1_inreg_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    liveins: $sgpr0, $sgpr1
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
index e6c835fa25406..117a654d853f5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
@@ -9,8 +9,10 @@ define i1 @i1_func_void() #0 {
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+  ; CHECK-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
+  ; CHECK-NEXT:   SI_RETURN implicit $sgpr0_sgpr1
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
 }
@@ -20,9 +22,11 @@ define zeroext i1 @i1_zeroext_func_void() #0 {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ZEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+  ; CHECK-NEXT:   [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+  ; CHECK-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
+  ; CHECK-NEXT:   SI_RETURN implicit $sgpr0_sgpr1
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
 }
@@ -32,9 +36,11 @@ define signext i1 @i1_signext_func_void() #0 {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[SEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+  ; CHECK-NEXT:   [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+  ; CHECK-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
+  ; CHECK-NEXT:   SI_RETURN implicit $sgpr0_sgpr1
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index d239b7271dd89..eece4397d1855 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -3,7 +3,7 @@
 ; the frame info, so some functions have manually added stack object
 ; checks.
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=GFX1100 -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s
 ; FIXME: pre-VI should have same ABI without legal i16 operations.
 
 define void @void_func_empty_arg({} %arg0, i32 %arg1) #0 {
@@ -42,6 +42,16 @@ define void @void_func_i1(i1 %arg0) #0 {
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
+  ;
+  ; GFX11-LABEL: name: void_func_i1
+  ; GFX11: bb.1 (%ir-block.0):
+  ; GFX11-NEXT:   liveins: $sgpr0
+  ; GFX11-NEXT: {{  $}}
+  ; GFX11-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+  ; GFX11-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+  ; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; GFX11-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+  ; GFX11-NEXT:   SI_RETURN
   store i1 %arg0, ptr addrspace(1) undef
   ret void
 }
@@ -2781,8 +2791,8 @@ define void @void_func_i1_inreg(i1 inreg %arg0) #0 {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $sgpr16
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr16
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
@@ -3232,199 +3242,6 @@ define void @void_func_v2p3_inreg(<2 x ptr addrspace(3)> inreg %arg0) #0 {
   ret void
 }
 
-; Check calling convention for i1 args
-define void @many_i1_args(
-  i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
-  i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
-  i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
-  i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
-; CHECK-LABEL: name: many_i1_args
-; CHECK: bb.1 (%ir-block.0):
-; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
-; CHECK-NEXT: {{  $}}
-; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
-; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
-; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s64) = COPY $sgpr18_sgpr19
-; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s64)
-; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr20_sgpr21
-; CHECK-NEXT:   [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
-; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr22_sgpr23
-; CHECK-NEXT:   [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
-; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(s64) = COPY $sgpr24_sgpr25
-; CHECK-NEXT:   [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s64)
-; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(s64) = COPY $sgpr26_sgpr27
-; CHECK-NEXT:   [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s64)
-; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(s64) = COPY $sgpr28_sgpr29
-; CHECK-NEXT:   [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s64)
-; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr0
-; CHECK-NEXT:   [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
-; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr1
-; CHECK-NEXT:   [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
-; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr2
-; CHECK-NEXT:   [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
-; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr3
-; CHECK-NEXT:   [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
-; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr4
-; CHECK-NEXT:   [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
-; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr5
-; CHECK-NEXT:   [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
-; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr6
-; CHECK-NEXT:   [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
-; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr7
-; CHECK-NEXT:   [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
-; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr8
-; CHECK-NEXT:   [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
-; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr9
-; CHECK-NEXT:   [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
-; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr10
-; CHECK-NEXT:   [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
-; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr11
-; CHECK-NEXT:   [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
-; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr12
-; CHECK-NEXT:   [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
-; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr13
-; CHECK-NEXT:   [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
-; CHECK-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr14
-; CHECK-NEXT:   [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
-; CHECK-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr15
-; CHECK-NEXT:   [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
-; CHECK-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr16
-; CHECK-NEXT:   [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
-; CHECK-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr17
-; CHECK-NEXT:   [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
-; CHECK-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr18
-; CHECK-NEXT:   [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
-; CHECK-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr19
-; CHECK-NEXT:   [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
-; CHECK-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr20
-; CHECK-NEXT:   [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
-; CHECK-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr21
-; CHECK-NEXT:   [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
-; CHECK-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr22
-; CHECK-NEXT:   [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
-; CHECK-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr23
-; CHECK-NEXT:   [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
-; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr24
-; CHECK-NEXT:   [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
-;
-; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-; G_STOREs to TRUNC1-TRUNC30 omitted
-; CHECK:        G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-;
-; GFX11-LABEL: name: many_i1_args
-; GFX11: bb.1 (%ir-block.0):
-; GFX11-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
-; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr16
-; GFX11-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr17
-; GFX11-NEXT:   [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
-; GFX11-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr18
-; GFX11-NEXT:   [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
-; GFX11-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr19
-; GFX11-NEXT:   [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s32)
-; GFX11-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr20
-; GFX11-NEXT:   [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s32)
-; GFX11-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr21
-; GFX11-NEXT:   [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s32)
-; GFX11-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr22
-; GFX11-NEXT:   [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s32)
-; GFX11-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr23
-; GFX11-NEXT:   [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
-; GFX11-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr24
-; GFX11-NEXT:   [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
-; GFX11-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr25
-; GFX11-NEXT:   [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
-; GFX11-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr26
-; GFX11-NEXT:   [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
-; GFX11-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr27
-; GFX11-NEXT:   [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
-; GFX11-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr28
-; GFX11-NEXT:   [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
-; GFX11-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY $sgpr29
-; GFX11-NEXT:   [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
-; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr0
-; GFX11-NEXT:   [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
-; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr1
-; GFX11-NEXT:   [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
-; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr2
-; GFX11-NEXT:   [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
-; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr3
-; GFX11-NEXT:   [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
-; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr4
-; GFX11-NEXT:   [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
-; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr5
-; GFX11-NEXT:   [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
-; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr6
-; GFX11-NEXT:   [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
-; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr7
-; GFX11-NEXT:   [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
-; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr8
-; GFX11-NEXT:   [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
-; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr9
-; GFX11-NEXT:   [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
-; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr10
-; GFX11-NEXT:   [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
-; GFX11-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr11
-; GFX11-NEXT:   [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
-; GFX11-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr12
-; GFX11-NEXT:   [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
-; GFX11-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr13
-; GFX11-NEXT:   [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
-; GFX11-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr14
-; GFX11-NEXT:   [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
-; GFX11-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr15
-; GFX11-NEXT:   [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
-; GFX11-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr16
-; GFX11-NEXT:   [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
-; GFX11-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr17
-; GFX11-NEXT:   [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
-;
-; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX11-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-; G_STOREs to TRUNC1-TRUNC30 omitted
-; GFX11:        G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-
-  store volatile i1 %arg0, ptr addrspace(1) undef
-  store volatile i1 %arg1, ptr addrspace(1) undef
-  store volatile i1 %arg2, ptr addrspace(1) undef
-  store volatile i1 %arg3, ptr addrspace(1) undef
-  store volatile i1 %arg4, ptr addrspace(1) undef
-  store volatile i1 %arg5, ptr addrspace(1) undef
-  store volatile i1 %arg6, ptr addrspace(1) undef
-  store volatile i1 %arg7, ptr addrspace(1) undef
-
-  store volatile i1 %arg8, ptr addrspace(1) undef
-  store volatile i1 %arg9, ptr addrspace(1) undef
-  store volatile i1 %arg10, ptr addrspace(1) undef
-  store volatile i1 %arg11, ptr addrspace(1) undef
-  store volatile i1 %arg12, ptr addrspace(1) undef
-  store volatile i1 %arg13, ptr addrspace(1) undef
-  store volatile i1 %arg14, ptr addrspace(1) undef
-  store volatile i1 %arg15, ptr addrspace(1) undef
-
-  store volatile i1 %arg16, ptr addrspace(1) undef
-  store volatile i1 %arg17, ptr addrspace(1) undef
-  store volatile i1 %arg18, ptr addrspace(1) undef
-  store volatile i1 %arg19, ptr addrspace(1) undef
-  store volatile i1 %arg20, ptr addrspace(1) undef
-  store volatile i1 %arg21, ptr addrspace(1) undef
-  store volatile i1 %arg22, ptr addrspace(1) undef
-  store volatile i1 %arg23, ptr addrspace(1) undef
-
-  store volatile i1 %arg24, ptr addrspace(1) undef
-  store volatile i1 %arg25, ptr addrspace(1) undef
-  store volatile i1 %arg26, ptr addrspace(1) undef
-  store volatile i1 %arg27, ptr addrspace(1) undef
-  store volatile i1 %arg28, ptr addrspace(1) undef
-  store volatile i1 %arg29, ptr addrspace(1) undef
-  store volatile i1 %arg30, ptr addrspace(1) undef
-  store volatile i1 %arg31, ptr addrspace(1) undef
-
-  ret void
-}
-
 attributes #0 = { nounwind }
 
 !llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 530e439ae572a..d6acf82318cee 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -149,6 +149,54 @@ bb2:
   ret void
 }
 
+define void @void_func_v2i1(<2 x i1> %arg0) #0 {
+; GFX9-LABEL: void_func_v2i1:
+; GFX9:          ; %bb.0:
+; GFX9-NEXT:       s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:       v_lshlrev_b16_e32 v1, 1, v1
+; GFX9-NEXT:       v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:       v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:       v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT:       s_mov_b32 s7, 0xf000
+; GFX9-NEXT:       s_mov_b32 s6, -1
+; GFX9-NEXT:       buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b16 v1, 1, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+
+  store <2 x i1> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_a2i1([2 x i1] %arg0) {
+; GFX9-LABEL: void_func_a2i1:
+; GFX9:        ; %bb.0:
+; GFX9-NEXT:     s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:     v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX9-NEXT:     s_mov_b32 s7, 0xf000
+; GFX9-NEXT:     s_mov_b32 s6, -1
+; GFX9-NEXT:     v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:     buffer_store_byte v1, off, s[4:7], 0
+; GFX9-NEXT:     buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT:     s_waitcnt vmcnt(0)
+; GFX9-NEXT:     s_setpc_b64 s[30:31]
+
+  store [2 x i1] %arg0, ptr addrspace(1) undef
+  ret void
+}
+
 define void @void_func_i8(i8 %arg0) #0 {
 ; CIGFX89-LABEL: void_func_i8:
 ; CIGFX89:       ; %bb.0:
@@ -2780,14 +2828,11 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:16
 ; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:12
 ; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:4
 ; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:16
 ; CI-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; CI-NEXT:    s_waitcnt vmcnt(2)
-; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2802,9 +2847,12 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_short v18, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT:    v_mul_f32_e32 v20, 1.0, v20
+; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
 ; CI-NEXT:    buffer_store_short v16, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v1, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4887,275 +4935,4 @@ define void @void_func_v16bf16(<16 x bfloat> %arg0) #0 {
   ret void
 }
 
-define void @many_i1_args(
-  i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
-  i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
-  i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
-  i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
-; GFX9-LABEL: many_i1_args:
-; GFX9:      ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_xor_saveexec_b64 vcc, -1
-; GFX9-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, vcc
-; GFX9-NEXT:    v_writelane_b32 v20, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v20, s31, 1
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GFX9-NEXT:    s_mov_b32 s31, 0xf000
-; GFX9-NEXT:    s_mov_b32 s30, -1
-; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[6:7]
-; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[8:9]
-; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[10:11]
-; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[12:13]
-; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[14:15]
-; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[16:17]
-; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[18:19]
-; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[20:21]
-; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[22:23]
-; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[24:25]
-; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[26:27]
-; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[28:29]
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v1
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v3
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v4
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v5
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v6
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v7
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v8
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v9
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v10
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v11
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v12
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v13
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v14
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v15
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v16
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v17
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v18
-; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v20, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: many_i1_args:
-; GFX11:      ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
-; GFX11-NEXT:    scratch_store_b32 off, v7, s32          ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, vcc_lo
-; GFX11-NEXT:    v_writelane_b32 v7, s30, 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s1
-; GFX11-NEXT:    s_mov_b32 s30, -1
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
-; GFX11-NEXT:    v_writelane_b32 v7, s31, 1
-; GFX11-NEXT:    s_mov_b32 s31, 0x31016000
-; GFX11-NEXT:    buffer_store_b8 v2, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s3
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s5
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s6
-; GFX11-NEXT:    buffer_store_b8 v2, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s7
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s8
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s9
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s10
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s11
-; GFX11-NEXT:    buffer_store_b8 v2, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s12
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s13
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s14
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s15
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s16
-; GFX11-NEXT:    buffer_store_b8 v2, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s17
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s18
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s19
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s20
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s21
-; GFX11-NEXT:    buffer_store_b8 v2, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s22
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s23
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s24
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s25
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s26
-; GFX11-NEXT:    buffer_store_b8 v2, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s27
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s28
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s29
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    buffer_store_b8 v2, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v0, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v1, off, s[28:31], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_readlane_b32 s31, v7, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v7, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v7, off, s32           ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store volatile i1 %arg0, ptr addrspace(1) undef
-  store volatile i1 %arg1, ptr addrspace(1) undef
-  store volatile i1 %arg2, ptr addrspace(1) undef
-  store volatile i1 %arg3, ptr addrspace(1) undef
-  store volatile i1 %arg4, ptr addrspace(1) undef
-  store volatile i1 %arg5, ptr addrspace(1) undef
-  store volatile i1 %arg6, ptr addrspace(1) undef
-  store volatile i1 %arg7, ptr addrspace(1) undef
-
-  store volatile i1 %arg8, ptr addrspace(1) undef
-  store volatile i1 %arg9, ptr addrspace(1) undef
-  store volatile i1 %arg10, ptr addrspace(1) undef
-  store volatile i1 %arg11, ptr addrspace(1) undef
-  store volatile i1 %arg12, ptr addrspace(1) undef
-  store volatile i1 %arg13, ptr addrspace(1) undef
-  store volatile i1 %arg14, ptr addrspace(1) undef
-  store volatile i1 %arg15, ptr addrspace(1) undef
-
-  store volatile i1 %arg16, ptr addrspace(1) undef
-  store volatile i1 %arg17, ptr addrspace(1) undef
-  store volatile i1 %arg18, ptr addrspace(1) undef
-  store volatile i1 %arg19, ptr addrspace(1) undef
-  store volatile i1 %arg20, ptr addrspace(1) undef
-  store volatile i1 %arg21, ptr addrspace(1) undef
-  store volatile i1 %arg22, ptr addrspace(1) undef
-  store volatile i1 %arg23, ptr addrspace(1) undef
-
-  store volatile i1 %arg24, ptr addrspace(1) undef
-  store volatile i1 %arg25, ptr addrspace(1) undef
-  store volatile i1 %arg26, ptr addrspace(1) undef
-  store volatile i1 %arg27, ptr addrspace(1) undef
-  store volatile i1 %arg28, ptr addrspace(1) undef
-  store volatile i1 %arg29, ptr addrspace(1) undef
-  store volatile i1 %arg30, ptr addrspace(1) undef
-  store volatile i1 %arg31, ptr addrspace(1) undef
-
-  ret void
-}
-
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
new file mode 100644
index 0000000000000..91c739701a1a8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
@@ -0,0 +1,198 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+
+define i1 @i1_func_void() {
+; GFX9-LABEL: i1_func_void:
+; GFX9:    ; %bb.0:
+; GFX9-NEXT:     s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:     global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:     s_waitcnt vmcnt(0)
+; GFX9-NEXT:     v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:     v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT:     s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i1_func_void:
+; GFX11:   ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load i1, ptr addrspace(1) undef
+  ret i1 %val
+}
+
+define void @test_call_i1_func_void() {
+; GFX9-LABEL: test_call_i1_func_void:
+; GFX9:          s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:     v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT:     global_store_byte v[0:1], v0, off
+;
+; GFX11-LABEL: test_call_i1_func_void:
+; GFX11:         s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+  %val = call i1 @i1_func_void()
+  store volatile i1 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define zeroext i1 @zeroext_i1_func_void() {
+; GFX9-LABEL: zeroext_i1_func_void:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: zeroext_i1_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load i1, ptr addrspace(1) undef
+  ret i1 %val
+}
+
+define void @test_call_zeroext_i1_func_void() {
+; GFX9-LABEL: test_call_zeroext_i1_func_void:
+; GFX9:         s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+;
+; GFX11-LABEL: test_call_zeroext_i1_func_void:
+; GFX11:         s_swappc_b64 s[30:31], s[4:5]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+  %val = call i1 @zeroext_i1_func_void()
+  store volatile i1 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define signext i1 @signext_i1_func_void() {
+; GFX9-LABEL: signext_i1_func_void:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: signext_i1_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load i1, ptr addrspace(1) undef
+  ret i1 %val
+}
+
+define void @test_call_signext_i1_func_void() {
+; GFX9-LABEL: test_call_signext_i1_func_void:
+; GFX9:          s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:     v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT:     global_store_byte v[0:1], v0, off
+;
+; GFX11-LABEL: test_call_signext_i1_func_void:
+; GFX11:         s_swappc_b64 s[30:31], s[4:5]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+  %val = call i1 @signext_i1_func_void()
+  store volatile i1 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define inreg i1 @inreg_i1_func_void() {
+; GFX9-LABEL: inreg_i1_func_void:
+; GFX9:    ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: inreg_i1_func_void:
+; GFX11:   ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load i1, ptr addrspace(1) undef
+  ret i1 %val
+}
+
+define void @test_call_inreg_i1_func_void() {
+; GFX9-LABEL: test_call_inreg_i1_func_void:
+; GFX9:         s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+;
+; GFX11-LABEL: test_call_inreg_i1_func_void:
+; GFX11:         s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+  %val = call i1 @inreg_i1_func_void()
+  store volatile i1 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define [2 x i1] @a2i1_func_void() {
+; GFX9-LABEL: a2i1_func_void:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %val = load [2 x i1], ptr addrspace(1) undef
+  ret [2 x i1] %val
+}
+
+define void @test_call_a2i1_func_void() {
+; GFX9-LABEL: test_call_a2i1_func_void:
+;
+; GFX11:         s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s1, s1, 0
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, s0, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+  %val = call [2 x i1] @a2i1_func_void()
+  store volatile [2 x i1] %val, ptr addrspace(1) undef
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
new file mode 100644
index 0000000000000..55f3422e5c834
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
@@ -0,0 +1,819 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+define void @void_func_i1(i1 %arg0) {
+; GFX9-LABEL: void_func_i1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @test_call_void_func_i1() {
+; GFX9-LABEL: test_call_void_func_i1:
+; GFX9:    ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, void_func_i1@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, void_func_i1@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+;
+; GFX11-LABEL: test_call_void_func_i1:
+; GFX11:     ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s4, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33          ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, void_func_i1 at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, void_func_i1 at gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+  %val = load i1, ptr addrspace(1) undef
+  call void @void_func_i1(i1 %val)
+  ret void
+}
+
+define void @void_func_i1_zeroext(i1 zeroext %arg0) {
+; GFX9-LABEL: void_func_i1_zeroext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:    v_or_b32_e32 v0, 12, v0
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_zeroext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, 12, v0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %ext = zext i1 %arg0 to i32
+  %add = add i32 %ext, 12
+  store i32 %add, ptr addrspace(1) undef
+  ret void
+}
+
+define void @test_call_void_func_i1_zeroext() {
+; GFX9-LABEL: test_call_void_func_i1_zeroext:
+; GFX9:    ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, void_func_i1_zeroext@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, void_func_i1_zeroext@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+;
+; GFX11-LABEL: test_call_void_func_i1_zeroext:
+; GFX11:     ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s4, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33          ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, void_func_i1_zeroext@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, void_func_i1_zeroext@gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+  %val = load i1, ptr addrspace(1) undef
+  call void @void_func_i1_zeroext(i1 %val)
+  ret void
+}
+
+define void @void_func_i1_signext(i1 signext %arg0) {
+; GFX9-LABEL: void_func_i1_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:    v_sub_u32_e32 v0, 12, v0
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_signext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 12, v0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %ext = sext i1 %arg0 to i32
+  %add = add i32 %ext, 12
+  store i32 %add, ptr addrspace(1) undef
+  ret void
+}
+
+define void @test_call_void_func_i1_signext() {
+; GFX9-LABEL: test_call_void_func_i1_signext:
+; GFX9:    ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, void_func_i1_signext@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, void_func_i1_signext@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+;
+; GFX11-LABEL: test_call_void_func_i1_signext:
+; GFX11:     ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s4, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33          ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, void_func_i1_signext@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, void_func_i1_signext@gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+  %val = load i1, ptr addrspace(1) undef
+  call void @void_func_i1_signext(i1 %val)
+  ret void
+}
+
+define void @void_func_a2i1([2 x i1] %arg0) {
+; GFX9-LABEL: void_func_a2i1:
+; GFX9:        ; %bb.0:
+; GFX9-NEXT:     s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:     v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:     v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX9-NEXT:     global_store_byte v[0:1], v1, off
+; GFX9-NEXT:     global_store_byte v[0:1], v0, off
+; GFX9-NEXT:     s_waitcnt vmcnt(0)
+; GFX9-NEXT:     s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_a2i1:
+; GFX11:        ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b8 v[0:1], v1, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store [2 x i1] %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @test_call_void_func_a2i1() {
+; GFX9-LABEL: test_call_void_func_a2i1:
+; GFX9:    ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s10, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, void_func_a2i1@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, void_func_a2i1@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], -1
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+;
+; GFX11-LABEL: test_call_void_func_a2i1:
+; GFX11:     ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s4, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33          ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, void_func_a2i1@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, void_func_a2i1@gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s1, -1
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+  %1 = insertvalue [2 x i1] undef, i1 0, 0
+  %2 = insertvalue [2 x i1] %1, i1 1, 1
+  call void @void_func_a2i1([2 x i1] %2)
+  ret void
+}
+
+define void @i1_arg_i1_use(i1 %arg) {
+; CIGFX89-LABEL: i1_arg_i1_use:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+; GFX9:        ; %bb.0:
+; GFX9-NEXT:     s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:     s_xor_b64 s[6:7], s[4:5], -1
+; GFX9-NEXT:     s_and_saveexec_b64 s[4:5], s[6:7]
+; GFX9:        ; %bb.1:
+; GFX9-NEXT:     v_mov_b32_e32 v0, 0
+; GFX9-NEXT:     global_store_dword v[0:1], v0, off
+; GFX9-NEXT:     s_waitcnt vmcnt(0)
+; GFX9-NEXT:   .LBB{{[0-9]+}}_2:
+; GFX9-NEXT:     s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:     s_waitcnt vmcnt(0)
+; GFX9-NEXT:     s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i1_arg_i1_use:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_xor_b32 s1, s0, -1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_saveexec_b32 s0, s1
+; GFX11:       ; %bb.1: ; %bb1
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:  .LBB{{[0-9]+}}_2: ; %bb2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  br i1 %arg, label %bb2, label %bb1
+
+bb1:
+  store volatile i32 0, ptr addrspace(1) undef
+  br label %bb2
+
+bb2:
+  ret void
+}
+
+define void @void_func_v2i1(<2 x i1> %arg0) {
+; GFX9-LABEL: void_func_v2i1:
+; GFX9:          ; %bb.0:
+; GFX9-NEXT:       s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:       v_lshlrev_b16_e32 v1, 1, v1
+; GFX9-NEXT:       v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:       v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:       v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT:       global_store_byte v[0:1], v0, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b16 v1, 1, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x i1> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @test_call_void_func_v2i1() {
+; GFX9-LABEL: test_call_void_func_v2i1:
+; GFX9:    ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, void_func_v2i1@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, void_func_v2i1@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 1
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+;
+; GFX11-LABEL: test_call_void_func_v2i1:
+; GFX11:     ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33          ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, void_func_v2i1@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, void_func_v2i1@gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+  %1 = insertelement <2 x i1> undef, i1 0, i32 0
+  %2 = insertelement <2 x i1> %1, i1 1, i32 1
+  call void @void_func_v2i1(<2 x i1> %2)
+  ret void
+}
+
+define void @void_func_i1_i1(i1 %arg0, i1 %arg1) {
+; GFX9-LABEL: void_func_i1_i1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[6:7]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  ret void
+}
+
+define void @test_call_void_func_i1_i1() {
+; GFX9-LABEL: test_call_void_func_i1_i1:
+; GFX9:    ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s10, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, void_func_i1_i1@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, void_func_i1_i1@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], -1
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+;
+; GFX11-LABEL: test_call_void_func_i1_i1:
+; GFX11:     ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s4, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33          ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, void_func_i1_i1@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, void_func_i1_i1@gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s1, -1
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+  %val = load i1, ptr addrspace(1) undef
+  call void @void_func_i1_i1(i1 %val, i1 true)
+  ret void
+}
+
+define void @void_func_a2i1_i1([2 x i1] %arg0, i1 %arg1) {
+; GFX9-LABEL: void_func_a2i1_i1:
+; GFX9:        ; %bb.0:
+; GFX9-NEXT:     s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:     v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:     v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX9-NEXT:     global_store_byte v[0:1], v1, off
+; GFX9-NEXT:     s_waitcnt vmcnt(0)
+; GFX9-NEXT:     global_store_byte v[0:1], v0, off
+; GFX9-NEXT:     s_waitcnt vmcnt(0)
+; GFX9-NEXT:     v_cndmask_b32_e64 v0, 0, 1, s[8:9]
+; GFX9-NEXT:     global_store_byte v[0:1], v0, off
+; GFX9-NEXT:     s_waitcnt vmcnt(0)
+; GFX9-NEXT:     s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_a2i1_i1:
+; GFX11:        ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX11-NEXT:    global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store volatile [2 x i1] %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  ret void
+}
+
+define void @many_i1_args(
+  i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
+  i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
+  i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
+  i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
+; GFX9-LABEL: many_i1_args:
+; GFX9:      ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GFX9-NEXT:    global_store_byte v[0:1], v19, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[6:7]
+; GFX9-NEXT:    global_store_byte v[0:1], v19, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[8:9]
+; GFX9-NEXT:    global_store_byte v[0:1], v19, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[10:11]
+; GFX9-NEXT:    global_store_byte v[0:1], v19, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[12:13]
+; GFX9-NEXT:    global_store_byte v[0:1], v19, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[14:15]
+; GFX9-NEXT:    global_store_byte v[0:1], v19, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[16:17]
+; GFX9-NEXT:    global_store_byte v[0:1], v19, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[18:19]
+; GFX9-NEXT:    global_store_byte v[0:1], v19, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[20:21]
+; GFX9-NEXT:    global_store_byte v[0:1], v19, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[22:23]
+; GFX9-NEXT:    global_store_byte v[0:1], v19, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[24:25]
+; GFX9-NEXT:    global_store_byte v[0:1], v19, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[26:27]
+; GFX9-NEXT:    global_store_byte v[0:1], v19, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[28:29]
+; GFX9-NEXT:    global_store_byte v[0:1], v19, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v1
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v3
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v4
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v5
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v6
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v7
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v8
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v9
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v10
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v11
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v12
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v13
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v14
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v15
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v16
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v17
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v18
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: many_i1_args:
+; GFX11:      ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s6
+; GFX11-NEXT:    global_store_b8 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s3
+; GFX11-NEXT:    global_store_b8 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s11
+; GFX11-NEXT:    global_store_b8 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s13
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s15
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s16
+; GFX11-NEXT:    global_store_b8 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s17
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s19
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s20
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s21
+; GFX11-NEXT:    global_store_b8 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s22
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s23
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s24
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s25
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s26
+; GFX11-NEXT:    global_store_b8 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s27
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s28
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s29
+; GFX11-NEXT:    global_store_b8 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  store volatile i1 %arg2, ptr addrspace(1) undef
+  store volatile i1 %arg3, ptr addrspace(1) undef
+  store volatile i1 %arg4, ptr addrspace(1) undef
+  store volatile i1 %arg5, ptr addrspace(1) undef
+  store volatile i1 %arg6, ptr addrspace(1) undef
+  store volatile i1 %arg7, ptr addrspace(1) undef
+
+  store volatile i1 %arg8, ptr addrspace(1) undef
+  store volatile i1 %arg9, ptr addrspace(1) undef
+  store volatile i1 %arg10, ptr addrspace(1) undef
+  store volatile i1 %arg11, ptr addrspace(1) undef
+  store volatile i1 %arg12, ptr addrspace(1) undef
+  store volatile i1 %arg13, ptr addrspace(1) undef
+  store volatile i1 %arg14, ptr addrspace(1) undef
+  store volatile i1 %arg15, ptr addrspace(1) undef
+
+  store volatile i1 %arg16, ptr addrspace(1) undef
+  store volatile i1 %arg17, ptr addrspace(1) undef
+  store volatile i1 %arg18, ptr addrspace(1) undef
+  store volatile i1 %arg19, ptr addrspace(1) undef
+  store volatile i1 %arg20, ptr addrspace(1) undef
+  store volatile i1 %arg21, ptr addrspace(1) undef
+  store volatile i1 %arg22, ptr addrspace(1) undef
+  store volatile i1 %arg23, ptr addrspace(1) undef
+
+  store volatile i1 %arg24, ptr addrspace(1) undef
+  store volatile i1 %arg25, ptr addrspace(1) undef
+  store volatile i1 %arg26, ptr addrspace(1) undef
+  store volatile i1 %arg27, ptr addrspace(1) undef
+  store volatile i1 %arg28, ptr addrspace(1) undef
+  store volatile i1 %arg29, ptr addrspace(1) undef
+  store volatile i1 %arg30, ptr addrspace(1) undef
+  store volatile i1 %arg31, ptr addrspace(1) undef
+
+  ret void
+}
+
+define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
+; GFX9-LABEL: void_func_i1_i1_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:    s_and_b32 s4, s6, 1
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_i1_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_and_b32 s0, s1, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
+; GFX9-LABEL: void_func_i1_inreg_i1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    global_store_byte v[0:1], v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_inreg_i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s0, s0, 1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index df2163c4f9578..fb5b4a704b8a1 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -40,6 +40,8 @@ define zeroext i1 @i1_zeroext_func_void() #0 {
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
 ; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX789-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i1_zeroext_func_void:
@@ -49,6 +51,9 @@ define zeroext i1 @i1_zeroext_func_void() #0 {
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
@@ -62,7 +67,8 @@ define signext i1 @i1_signext_func_void() #0 {
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
 ; GFX789-NEXT:    s_waitcnt vmcnt(0)
-; GFX789-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX789-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX789-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i1_signext_func_void:
@@ -72,7 +78,9 @@ define signext i1 @i1_signext_func_void() #0 {
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val

>From e6e574dabfd9a4cacdbe7924aa0c23dc47e413f5 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 22 Mar 2024 16:15:26 -0500
Subject: [PATCH 14/25] Fix i1 return lowering with GlobalISel and add
 test cases.

---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |  17 +-
 .../GlobalISel/function-call-i1-return.ll     |  33 ++--
 .../AMDGPU/GlobalISel/function-returns.ll     |  21 +--
 .../CodeGen/AMDGPU/function-call-i1-return.ll | 146 ++++++++++++++----
 llvm/test/CodeGen/AMDGPU/function-i1-args.ll  |  88 ++++++++++-
 5 files changed, 217 insertions(+), 88 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 2d25827906f15..2b2584e6cbe40 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -63,6 +63,11 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
 
   void assignValueToReg(Register ValVReg, Register PhysReg,
                         const CCValAssign &VA) override {
+    if (VA.getLocVT() == MVT::i1 && MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64()) {
+      MIRBuilder.buildCopy(PhysReg, ValVReg);
+      return;
+    }
+
     Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
 
     // If this is a scalar return, insert a readfirstlane just in case the value
@@ -88,9 +93,6 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
                                         {MRI.getType(ExtReg)})
                         .addReg(ExtReg);
       ExtReg = ToSGPR.getReg(0);
-      if (VA.getLocVT() == MVT::i1 &&
-          MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
-        ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(64), ExtReg).getReg(0);
     }
 
     MIRBuilder.buildCopy(PhysReg, ExtReg);
@@ -127,12 +129,9 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
     if (VA.getLocVT().getSizeInBits() < 32) {
       // 16-bit types are reported as legal for 32-bit registers. We need to do
       // a 32-bit copy, and truncate to avoid the verifier complaining about it.
-      unsigned CopyToBits = 32;
-
-      // When function return type is i1, it may be in a 64b register.
-      if (VA.getLocVT() == MVT::i1 &&
-          MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
-        CopyToBits = 64;
+      //
+      // However, when function return type is i1, it may be in a 64b register.
+      unsigned CopyToBits = (VA.getLocVT() == MVT::i1 && MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64()) ? 64 : 32;
 
       auto Copy = MIRBuilder.buildCopy(LLT::scalar(CopyToBits), PhysReg);
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
index 24a51a9904d25..86198dd70218b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
@@ -7,11 +7,8 @@ define i1 @i1_func_void() {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
 ; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX9-NEXT:    [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-; GFX9-NEXT:    [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
-; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
-; GFX9-NEXT:    SI_RETURN implicit $sgpr0_sgpr1
+; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    SI_RETURN
 ;
 ; GFX11-LABEL: name: i1_func_void
 ; GFX11: bb.1 (%ir-block.0):
@@ -61,11 +58,8 @@ define zeroext i1 @zeroext_i1_func_void() {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
 ; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX9-NEXT:    [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-; GFX9-NEXT:    [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
-; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
-; GFX9-NEXT:    SI_RETURN implicit $sgpr0_sgpr1
+; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    SI_RETURN
 ;
 ; GFX11-LABEL: name: zeroext_i1_func_void
 ; GFX11: bb.1 (%ir-block.0):
@@ -115,11 +109,8 @@ define signext i1 @signext_i1_func_void() {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
 ; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX9-NEXT:    [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-; GFX9-NEXT:    [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
-; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
-; GFX9-NEXT:    SI_RETURN implicit $sgpr0_sgpr1
+; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    SI_RETURN
 ;
 ; GFX11-LABEL: name: signext_i1_func_void
 ; GFX11: bb.1 (%ir-block.0):
@@ -223,15 +214,9 @@ define [2 x i1] @a2i1_func_void() {
 ; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
 ; GFX9-NEXT:    [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
 ; GFX9-NEXT:    [[LOAD2:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD]](p1) :: (load (s1) from `ptr addrspace(1) undef` + 1, addrspace 1)
-; GFX9-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX9-NEXT:    [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-; GFX9-NEXT:    [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
-; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
-; GFX9-NEXT:    [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD2]](s1)
-; GFX9-NEXT:    [[INTRIN2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT3]](s32)
-; GFX9-NEXT:    [[ANYEXT4:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN2]](s32)
-; GFX9-NEXT:    $sgpr2_sgpr3 = COPY [[ANYEXT4]](s64)
-; GFX9-NEXT:    SI_RETURN implicit $sgpr0_sgpr1, implicit $sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    $sgpr2_sgpr3 = COPY [[LOAD2]](s1)
+; GFX9-NEXT:    SI_RETURN
 ;
 ; GFX11-LABEL: name: a2i1_func_void
 ; GFX11: bb.1 (%ir-block.0):
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
index 117a654d853f5..252afe1712464 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
@@ -8,11 +8,8 @@ define i1 @i1_func_void() #0 {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-  ; CHECK-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
-  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
-  ; CHECK-NEXT:   SI_RETURN implicit $sgpr0_sgpr1
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+  ; CHECK-NEXT:   SI_RETURN
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
 }
@@ -22,11 +19,8 @@ define zeroext i1 @i1_zeroext_func_void() #0 {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-  ; CHECK-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
-  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
-  ; CHECK-NEXT:   SI_RETURN implicit $sgpr0_sgpr1
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+  ; CHECK-NEXT:   SI_RETURN
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
 }
@@ -36,11 +30,8 @@ define signext i1 @i1_signext_func_void() #0 {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-  ; CHECK-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
-  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
-  ; CHECK-NEXT:   SI_RETURN implicit $sgpr0_sgpr1
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+  ; CHECK-NEXT:   SI_RETURN
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
 }
diff --git a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
index 91c739701a1a8..5319bbac3a087 100644
--- a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
@@ -1,26 +1,41 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+
+ at GV = external addrspace(1) global i32
 
 define i1 @i1_func_void() {
 ; GFX9-LABEL: i1_func_void:
 ; GFX9:    ; %bb.0:
-; GFX9-NEXT:     s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:     global_load_ubyte v0, v[0:1], off
-; GFX9-NEXT:     s_waitcnt vmcnt(0)
-; GFX9-NEXT:     v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:     v_cmp_eq_u32_e64 s[0:1], 1, v0
-; GFX9-NEXT:     s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, GV at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, GV at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i1_func_void:
 ; GFX11:   ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, GV at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, GV at gotpcrel32@hi+12
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val = load i1, ptr addrspace(1) undef
+  %val = load i1, ptr addrspace(1) @GV
   ret i1 %val
 }
 
@@ -46,7 +61,13 @@ define zeroext i1 @zeroext_i1_func_void() {
 ; GFX9-LABEL: zeroext_i1_func_void:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, GV at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, GV at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
@@ -55,13 +76,19 @@ define zeroext i1 @zeroext_i1_func_void() {
 ; GFX11-LABEL: zeroext_i1_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, GV at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, GV at gotpcrel32@hi+12
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val = load i1, ptr addrspace(1) undef
+  %val = load i1, ptr addrspace(1) @GV
   ret i1 %val
 }
 
@@ -72,7 +99,7 @@ define void @test_call_zeroext_i1_func_void() {
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
 ;
 ; GFX11-LABEL: test_call_zeroext_i1_func_void:
-; GFX11:         s_swappc_b64 s[30:31], s[4:5]
+; GFX11:         s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
@@ -87,7 +114,13 @@ define signext i1 @signext_i1_func_void() {
 ; GFX9-LABEL: signext_i1_func_void:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, GV at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, GV at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
@@ -96,13 +129,19 @@ define signext i1 @signext_i1_func_void() {
 ; GFX11-LABEL: signext_i1_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, GV at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, GV at gotpcrel32@hi+12
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val = load i1, ptr addrspace(1) undef
+  %val = load i1, ptr addrspace(1) @GV
   ret i1 %val
 }
 
@@ -113,7 +152,7 @@ define void @test_call_signext_i1_func_void() {
 ; GFX9-NEXT:     global_store_byte v[0:1], v0, off
 ;
 ; GFX11-LABEL: test_call_signext_i1_func_void:
-; GFX11:         s_swappc_b64 s[30:31], s[4:5]
+; GFX11:         s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
@@ -128,17 +167,29 @@ define inreg i1 @inreg_i1_func_void() {
 ; GFX9-LABEL: inreg_i1_func_void:
 ; GFX9:    ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, GV at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, GV at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: inreg_i1_func_void:
 ; GFX11:   ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, GV at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, GV at gotpcrel32@hi+12
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val = load i1, ptr addrspace(1) undef
+  %val = load i1, ptr addrspace(1) @GV
   ret i1 %val
 }
 
@@ -164,33 +215,64 @@ define [2 x i1] @a2i1_func_void() {
 ; GFX9-LABEL: a2i1_func_void:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, GV at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, GV at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[4:5]
+; GFX9-NEXT:    global_load_ubyte v2, v0, s[4:5] offset:1
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[4:5]
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %val = load [2 x i1], ptr addrspace(1) undef
+;
+; GFX11-LABEL: a2i1_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, GV at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, GV at gotpcrel32@hi+12
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_u8 v1, v0, s[0:1]
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:1
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load [2 x i1], ptr addrspace(1) @GV
   ret [2 x i1] %val
 }
 
 define void @test_call_a2i1_func_void() {
 ; GFX9-LABEL: test_call_a2i1_func_void:
+; GFX9:         s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
 ;
+; GFX11-LABEL: test_call_a2i1_func_void:
 ; GFX11:         s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s1, s1, 0
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, s0, 0
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v1, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
   %val = call [2 x i1] @a2i1_func_void()
   store volatile [2 x i1] %val, ptr addrspace(1) undef
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
index 55f3422e5c834..c9877db735ebb 100644
--- a/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
@@ -21,7 +21,7 @@ define void @void_func_i1(i1 %arg0) {
   ret void
 }
 
-define void @test_call_void_func_i1() {
+define void @test_call_void_func_i1(ptr addrspace(1) %in) {
 ; GFX9-LABEL: test_call_void_func_i1:
 ; GFX9:    ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -66,7 +66,7 @@ define void @test_call_void_func_i1() {
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-  %val = load i1, ptr addrspace(1) undef
+  %val = load i1, ptr addrspace(1) %in
   call void @void_func_i1(i1 %val)
   ret void
 }
@@ -95,7 +95,7 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) {
   ret void
 }
 
-define void @test_call_void_func_i1_zeroext() {
+define void @test_call_void_func_i1_zeroext(ptr addrspace(1) %in) {
 ; GFX9-LABEL: test_call_void_func_i1_zeroext:
 ; GFX9:    ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -140,7 +140,7 @@ define void @test_call_void_func_i1_zeroext() {
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-  %val = load i1, ptr addrspace(1) undef
+  %val = load i1, ptr addrspace(1) %in
   call void @void_func_i1_zeroext(i1 %val)
   ret void
 }
@@ -169,7 +169,7 @@ define void @void_func_i1_signext(i1 signext %arg0) {
   ret void
 }
 
-define void @test_call_void_func_i1_signext() {
+define void @test_call_void_func_i1_signext(ptr addrspace(1) %in) {
 ; GFX9-LABEL: test_call_void_func_i1_signext:
 ; GFX9:    ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -214,7 +214,7 @@ define void @test_call_void_func_i1_signext() {
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-  %val = load i1, ptr addrspace(1) undef
+  %val = load i1, ptr addrspace(1) %in
   call void @void_func_i1_signext(i1 %val)
   ret void
 }
@@ -401,6 +401,78 @@ define void @test_call_void_func_v2i1() {
   ret void
 }
 
+define void @void_func_v2i1_inreg(<2 x i1> inreg %arg0) {
+; GFX9-LABEL: void_func_v2i1_inreg:
+; GFX9:    ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e64 v0, 1, s5
+; GFX9-NEXT:    v_and_b32_e64 v1, s4, 1
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i1_inreg:
+; GFX11:    ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b16 v0, 1, s1
+; GFX11-NEXT:    v_and_b32_e64 v1, s0, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x i1> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @test_call_void_func_v2i1_inreg() {
+; GFX9-LABEL: test_call_void_func_v2i1_inreg:
+; GFX9:    ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, void_func_v2i1_inreg@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, void_func_v2i1_inreg@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_mov_b32 s5, 1
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+;
+; GFX11-LABEL: test_call_void_func_v2i1_inreg:
+; GFX11:    ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s4, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33          ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, void_func_v2i1_inreg@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, void_func_v2i1_inreg@gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s1, 1
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+  %1 = insertelement <2 x i1> undef, i1 0, i32 0
+  %2 = insertelement <2 x i1> %1, i1 1, i32 1
+  call void @void_func_v2i1_inreg(<2 x i1> %2)
+  ret void
+}
+
 define void @void_func_i1_i1(i1 %arg0, i1 %arg1) {
 ; GFX9-LABEL: void_func_i1_i1:
 ; GFX9:       ; %bb.0:
@@ -428,7 +500,7 @@ define void @void_func_i1_i1(i1 %arg0, i1 %arg1) {
   ret void
 }
 
-define void @test_call_void_func_i1_i1() {
+define void @test_call_void_func_i1_i1(ptr addrspace(1) %in) {
 ; GFX9-LABEL: test_call_void_func_i1_i1:
 ; GFX9:    ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -475,7 +547,7 @@ define void @test_call_void_func_i1_i1() {
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-  %val = load i1, ptr addrspace(1) undef
+  %val = load i1, ptr addrspace(1) %in
   call void @void_func_i1_i1(i1 %val, i1 true)
   ret void
 }

>From 4f54c9847c5b9abb98c78e809b82693bd6480421 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 22 Mar 2024 16:25:27 -0500
Subject: [PATCH 15/25] Fix formatting.

---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 2b2584e6cbe40..6f2425c71f09a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -63,7 +63,8 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
 
   void assignValueToReg(Register ValVReg, Register PhysReg,
                         const CCValAssign &VA) override {
-    if (VA.getLocVT() == MVT::i1 && MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64()) {
+    if (VA.getLocVT() == MVT::i1 &&
+        MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64()) {
       MIRBuilder.buildCopy(PhysReg, ValVReg);
       return;
     }
@@ -131,7 +132,11 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
       // a 32-bit copy, and truncate to avoid the verifier complaining about it.
       //
       // However, when function return type is i1, it may be in a 64b register.
-      unsigned CopyToBits = (VA.getLocVT() == MVT::i1 && MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64()) ? 64 : 32;
+      unsigned CopyToBits =
+          (VA.getLocVT() == MVT::i1 &&
+           MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
+              ? 64
+              : 32;
 
       auto Copy = MIRBuilder.buildCopy(LLT::scalar(CopyToBits), PhysReg);
 
@@ -276,7 +281,7 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
     assignValueToAddress(ValVReg, Addr, MemTy, MPO, VA);
   }
 };
-}
+} // namespace
 
 AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
   : CallLowering(&TLI) {

>From a79ddaeaf0fe54d14d8dcfb7d582884861c76263 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 1 Apr 2024 13:18:42 -0500
Subject: [PATCH 16/25] Use update_llc_test_checks.py on new test files; remove
 incorrect comments in 2 new test files.

---
 .../GlobalISel/function-call-i1-return.ll     |   1 -
 .../AMDGPU/GlobalISel/function-i1-args.ll     |   1 -
 .../CodeGen/AMDGPU/function-call-i1-return.ll | 276 ++++++++++++++--
 llvm/test/CodeGen/AMDGPU/function-i1-args.ll  | 294 +++++++++++++-----
 4 files changed, 479 insertions(+), 93 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
index 86198dd70218b..81a1994b5afb1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX9 -enable-var-scope %s
 ; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 -enable-var-scope %s
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
index f4c85df0e0a1b..134751ee1e313 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX9 -enable-var-scope %s
 ; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 -enable-var-scope %s
 
diff --git a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
index 5319bbac3a087..0b3366f71d89c 100644
--- a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
@@ -6,7 +6,7 @@
 
 define i1 @i1_func_void() {
 ; GFX9-LABEL: i1_func_void:
-; GFX9:    ; %bb.0:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, GV@gotpcrel32@lo+4
@@ -21,7 +21,7 @@ define i1 @i1_func_void() {
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i1_func_void:
-; GFX11:   ; %bb.0:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, GV@gotpcrel32@lo+4
@@ -41,17 +41,65 @@ define i1 @i1_func_void() {
 
 define void @test_call_i1_func_void() {
 ; GFX9-LABEL: test_call_i1_func_void:
-; GFX9:          s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:     v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT:     global_store_byte v[0:1], v0, off
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, i1_func_void@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, i1_func_void@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_call_i1_func_void:
-; GFX11:         s_swappc_b64 s[30:31], s[0:1]
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, i1_func_void@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, i1_func_void@gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call i1 @i1_func_void()
   store volatile i1 %val, ptr addrspace(1) undef
   ret void
@@ -94,17 +142,65 @@ define zeroext i1 @zeroext_i1_func_void() {
 
 define void @test_call_zeroext_i1_func_void() {
 ; GFX9-LABEL: test_call_zeroext_i1_func_void:
-; GFX9:         s_swappc_b64 s[30:31], s[4:5]
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, zeroext_i1_func_void@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, zeroext_i1_func_void@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_call_zeroext_i1_func_void:
-; GFX11:         s_swappc_b64 s[30:31], s[0:1]
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, zeroext_i1_func_void@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, zeroext_i1_func_void@gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call i1 @zeroext_i1_func_void()
   store volatile i1 %val, ptr addrspace(1) undef
   ret void
@@ -147,17 +243,65 @@ define signext i1 @signext_i1_func_void() {
 
 define void @test_call_signext_i1_func_void() {
 ; GFX9-LABEL: test_call_signext_i1_func_void:
-; GFX9:          s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:     v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT:     global_store_byte v[0:1], v0, off
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, signext_i1_func_void@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, signext_i1_func_void@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_call_signext_i1_func_void:
-; GFX11:         s_swappc_b64 s[30:31], s[0:1]
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, signext_i1_func_void@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, signext_i1_func_void@gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call i1 @signext_i1_func_void()
   store volatile i1 %val, ptr addrspace(1) undef
   ret void
@@ -165,7 +309,7 @@ define void @test_call_signext_i1_func_void() {
 
 define inreg i1 @inreg_i1_func_void() {
 ; GFX9-LABEL: inreg_i1_func_void:
-; GFX9:    ; %bb.0:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, GV@gotpcrel32@lo+4
@@ -178,7 +322,7 @@ define inreg i1 @inreg_i1_func_void() {
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: inreg_i1_func_void:
-; GFX11:   ; %bb.0:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, GV@gotpcrel32@lo+4
@@ -195,17 +339,65 @@ define inreg i1 @inreg_i1_func_void() {
 
 define void @test_call_inreg_i1_func_void() {
 ; GFX9-LABEL: test_call_inreg_i1_func_void:
-; GFX9:         s_swappc_b64 s[30:31], s[4:5]
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, inreg_i1_func_void@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, inreg_i1_func_void@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_call_inreg_i1_func_void:
-; GFX11:         s_swappc_b64 s[30:31], s[0:1]
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, inreg_i1_func_void@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, inreg_i1_func_void@gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call i1 @inreg_i1_func_void()
   store volatile i1 %val, ptr addrspace(1) undef
   ret void
@@ -257,15 +449,55 @@ define [2 x i1] @a2i1_func_void() {
 
 define void @test_call_a2i1_func_void() {
 ; GFX9-LABEL: test_call_a2i1_func_void:
-; GFX9:         s_swappc_b64 s[30:31], s[4:5]
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, a2i1_func_void@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, a2i1_func_void@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_call_a2i1_func_void:
-; GFX11:         s_swappc_b64 s[30:31], s[0:1]
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, a2i1_func_void@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, a2i1_func_void@gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
@@ -273,6 +505,14 @@ define void @test_call_a2i1_func_void() {
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call [2 x i1] @a2i1_func_void()
   store volatile [2 x i1] %val, ptr addrspace(1) undef
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
index c9877db735ebb..2d63695674404 100644
--- a/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
@@ -23,7 +23,7 @@ define void @void_func_i1(i1 %arg0) {
 
 define void @test_call_void_func_i1(ptr addrspace(1) %in) {
 ; GFX9-LABEL: test_call_void_func_i1:
-; GFX9:    ; %bb.0:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s8, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
@@ -43,14 +43,23 @@ define void @test_call_void_func_i1(ptr addrspace(1) %in) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_call_void_func_i1:
-; GFX11:     ; %bb.0:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s4, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v2, s33          ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
@@ -66,6 +75,15 @@ define void @test_call_void_func_i1(ptr addrspace(1) %in) {
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i1, ptr addrspace(1) %in
   call void @void_func_i1(i1 %val)
   ret void
@@ -97,7 +115,7 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) {
 
 define void @test_call_void_func_i1_zeroext(ptr addrspace(1) %in) {
 ; GFX9-LABEL: test_call_void_func_i1_zeroext:
-; GFX9:    ; %bb.0:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s8, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
@@ -117,14 +135,23 @@ define void @test_call_void_func_i1_zeroext(ptr addrspace(1) %in) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_call_void_func_i1_zeroext:
-; GFX11:     ; %bb.0:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s4, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v2, s33          ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
@@ -140,6 +167,15 @@ define void @test_call_void_func_i1_zeroext(ptr addrspace(1) %in) {
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i1, ptr addrspace(1) %in
   call void @void_func_i1_zeroext(i1 %val)
   ret void
@@ -171,7 +207,7 @@ define void @void_func_i1_signext(i1 signext %arg0) {
 
 define void @test_call_void_func_i1_signext(ptr addrspace(1) %in) {
 ; GFX9-LABEL: test_call_void_func_i1_signext:
-; GFX9:    ; %bb.0:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s8, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
@@ -191,14 +227,23 @@ define void @test_call_void_func_i1_signext(ptr addrspace(1) %in) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_call_void_func_i1_signext:
-; GFX11:     ; %bb.0:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s4, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v2, s33          ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
@@ -214,6 +259,15 @@ define void @test_call_void_func_i1_signext(ptr addrspace(1) %in) {
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i1, ptr addrspace(1) %in
   call void @void_func_i1_signext(i1 %val)
   ret void
@@ -221,17 +275,17 @@ define void @test_call_void_func_i1_signext(ptr addrspace(1) %in) {
 
 define void @void_func_a2i1([2 x i1] %arg0) {
 ; GFX9-LABEL: void_func_a2i1:
-; GFX9:        ; %bb.0:
-; GFX9-NEXT:     s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:     v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX9-NEXT:     v_cndmask_b32_e64 v1, 0, 1, s[6:7]
-; GFX9-NEXT:     global_store_byte v[0:1], v1, off
-; GFX9-NEXT:     global_store_byte v[0:1], v0, off
-; GFX9-NEXT:     s_waitcnt vmcnt(0)
-; GFX9-NEXT:     s_setpc_b64 s[30:31]
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX9-NEXT:    global_store_byte v[0:1], v1, off
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_a2i1:
-; GFX11:        ; %bb.0:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
@@ -245,7 +299,7 @@ define void @void_func_a2i1([2 x i1] %arg0) {
 
 define void @test_call_void_func_a2i1() {
 ; GFX9-LABEL: test_call_void_func_a2i1:
-; GFX9:    ; %bb.0:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s10, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
@@ -263,10 +317,45 @@ define void @test_call_void_func_a2i1() {
 ; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s10
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_call_void_func_a2i1:
-; GFX11:     ; %bb.0:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s4, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, void_func_a2i1 at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, void_func_a2i1 at gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s1, -1
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-NEXT;    s_mov_b32 s4, s33
 ; GFX11-NEXT;    s_mov_b32 s33, s32
 ; GFX11-NEXT;    s_xor_saveexec_b32 s0, -1
@@ -293,18 +382,20 @@ define void @i1_arg_i1_use(i1 %arg) {
 ; CIGFX89-LABEL: i1_arg_i1_use:
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
-; GFX9:        ; %bb.0:
-; GFX9-NEXT:     s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:     s_xor_b64 s[6:7], s[4:5], -1
-; GFX9-NEXT:     s_and_saveexec_b64 s[4:5], s[6:7]
-; GFX9:        ; %bb.1:
-; GFX9-NEXT:     v_mov_b32_e32 v0, 0
-; GFX9-NEXT:     global_store_dword v[0:1], v0, off
-; GFX9-NEXT:     s_waitcnt vmcnt(0)
-; GFX9-NEXT:   .LBB{{[0-9]+}}_2:
-; GFX9-NEXT:     s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:     s_waitcnt vmcnt(0)
-; GFX9-NEXT:     s_setpc_b64 s[30:31]
+; GFX9-LABEL: i1_arg_i1_use:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[6:7]
+; GFX9-NEXT:    s_cbranch_execz .LBB8_2
+; GFX9-NEXT:  ; %bb.1: ; %bb1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:  .LBB8_2: ; %bb2
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i1_arg_i1_use:
 ; GFX11:       ; %bb.0: ; %bb
@@ -312,11 +403,12 @@ define void @i1_arg_i1_use(i1 %arg) {
 ; GFX11-NEXT:    s_xor_b32 s1, s0, -1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s1
-; GFX11:       ; %bb.1: ; %bb1
+; GFX11-NEXT:    s_cbranch_execz .LBB8_2
+; GFX11-NEXT:  ; %bb.1: ; %bb1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:  .LBB{{[0-9]+}}_2: ; %bb2
+; GFX11-NEXT:  .LBB8_2: ; %bb2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -332,15 +424,15 @@ bb2:
 
 define void @void_func_v2i1(<2 x i1> %arg0) {
 ; GFX9-LABEL: void_func_v2i1:
-; GFX9:          ; %bb.0:
-; GFX9-NEXT:       s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:       v_lshlrev_b16_e32 v1, 1, v1
-; GFX9-NEXT:       v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:       v_or_b32_e32 v0, v0, v1
-; GFX9-NEXT:       v_and_b32_e32 v0, 3, v0
-; GFX9-NEXT:       global_store_byte v[0:1], v0, off
-; GFX9-NEXT:       s_waitcnt vmcnt(0)
-; GFX9-NEXT:       s_setpc_b64 s[30:31]
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v2i1:
 ; GFX11:       ; %bb.0:
@@ -358,7 +450,7 @@ define void @void_func_v2i1(<2 x i1> %arg0) {
 
 define void @test_call_void_func_v2i1() {
 ; GFX9-LABEL: test_call_void_func_v2i1:
-; GFX9:    ; %bb.0:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s6, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
@@ -376,14 +468,23 @@ define void @test_call_void_func_v2i1() {
 ; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_call_void_func_v2i1:
-; GFX11:     ; %bb.0:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s2, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v2, s33          ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
@@ -395,6 +496,16 @@ define void @test_call_void_func_v2i1() {
 ; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %1 = insertelement <2 x i1> undef, i1 0, i32 0
   %2 = insertelement <2 x i1> %1, i1 1, i32 1
   call void @void_func_v2i1(<2 x i1> %2)
@@ -403,7 +514,7 @@ define void @test_call_void_func_v2i1() {
 
 define void @void_func_v2i1_inreg(<2 x i1> inreg %arg0) {
 ; GFX9-LABEL: void_func_v2i1_inreg:
-; GFX9:    ; %bb.0:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e64 v0, 1, s5
 ; GFX9-NEXT:    v_and_b32_e64 v1, s4, 1
@@ -414,7 +525,7 @@ define void @void_func_v2i1_inreg(<2 x i1> inreg %arg0) {
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v2i1_inreg:
-; GFX11:    ; %bb.0:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 1, s1
 ; GFX11-NEXT:    v_and_b32_e64 v1, s0, 1
@@ -429,7 +540,7 @@ define void @void_func_v2i1_inreg(<2 x i1> inreg %arg0) {
 
 define void @test_call_void_func_v2i1_inreg() {
 ; GFX9-LABEL: test_call_void_func_v2i1_inreg:
-; GFX9:    ; %bb.0:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s8, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
@@ -447,14 +558,23 @@ define void @test_call_void_func_v2i1_inreg() {
 ; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_call_void_func_v2i1_inreg:
-; GFX11:    ; %bb.0:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s4, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v2, s33          ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
@@ -467,6 +587,16 @@ define void @test_call_void_func_v2i1_inreg() {
 ; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %1 = insertelement <2 x i1> undef, i1 0, i32 0
   %2 = insertelement <2 x i1> %1, i1 1, i32 1
   call void @void_func_v2i1_inreg(<2 x i1> %2)
@@ -502,7 +632,7 @@ define void @void_func_i1_i1(i1 %arg0, i1 %arg1) {
 
 define void @test_call_void_func_i1_i1(ptr addrspace(1) %in) {
 ; GFX9-LABEL: test_call_void_func_i1_i1:
-; GFX9:    ; %bb.0:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s10, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
@@ -523,14 +653,23 @@ define void @test_call_void_func_i1_i1(ptr addrspace(1) %in) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s10
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_call_void_func_i1_i1:
-; GFX11:     ; %bb.0:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s4, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v2, s33          ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
@@ -547,6 +686,15 @@ define void @test_call_void_func_i1_i1(ptr addrspace(1) %in) {
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i1, ptr addrspace(1) %in
   call void @void_func_i1_i1(i1 %val, i1 true)
   ret void
@@ -554,21 +702,21 @@ define void @test_call_void_func_i1_i1(ptr addrspace(1) %in) {
 
 define void @void_func_a2i1_i1([2 x i1] %arg0, i1 %arg1) {
 ; GFX9-LABEL: void_func_a2i1_i1:
-; GFX9:        ; %bb.0:
-; GFX9-NEXT:     s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:     v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX9-NEXT:     v_cndmask_b32_e64 v1, 0, 1, s[6:7]
-; GFX9-NEXT:     global_store_byte v[0:1], v1, off
-; GFX9-NEXT:     s_waitcnt vmcnt(0)
-; GFX9-NEXT:     global_store_byte v[0:1], v0, off
-; GFX9-NEXT:     s_waitcnt vmcnt(0)
-; GFX9-NEXT:     v_cndmask_b32_e64 v0, 0, 1, s[8:9]
-; GFX9-NEXT:     global_store_byte v[0:1], v0, off
-; GFX9-NEXT:     s_waitcnt vmcnt(0)
-; GFX9-NEXT:     s_setpc_b64 s[30:31]
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX9-NEXT:    global_store_byte v[0:1], v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[8:9]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_a2i1_i1:
-; GFX11:        ; %bb.0:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
@@ -586,12 +734,8 @@ define void @void_func_a2i1_i1([2 x i1] %arg0, i1 %arg1) {
 }
 
 define void @many_i1_args(
-  i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
-  i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
-  i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
-  i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
 ; GFX9-LABEL: many_i1_args:
-; GFX9:      ; %bb.0:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
 ; GFX9-NEXT:    global_store_byte v[0:1], v19, off
@@ -692,7 +836,7 @@ define void @many_i1_args(
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: many_i1_args:
-; GFX11:      ; %bb.0:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s1
@@ -791,6 +935,10 @@ define void @many_i1_args(
 ; GFX11-NEXT:    global_store_b8 v[0:1], v1, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
+  i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
+  i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
+  i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
   store volatile i1 %arg0, ptr addrspace(1) undef
   store volatile i1 %arg1, ptr addrspace(1) undef
   store volatile i1 %arg2, ptr addrspace(1) undef

>From d3338c9365a2d77b15b5665c1e08a7493d635dca Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 29 Apr 2024 19:04:50 -0500
Subject: [PATCH 17/25] For GlobalISel: (1) for incoming i1 arg/return, do not
 generate G_TRUNC; however, we need to set the register class to avoid later
 problems with instruction selection (2) for outgoing i1, do not differentiate
 between wavesize32 and wavesize64.

---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |  27 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |   6 +
 .../GlobalISel/function-call-i1-return.ll     |  78 ++----
 .../AMDGPU/GlobalISel/function-i1-args.ll     | 252 +++++++-----------
 .../irtranslator-call-return-values.ll        |  15 +-
 .../GlobalISel/irtranslator-function-args.ll  |  34 +--
 .../GlobalISel/irtranslator-invariant.ll      |   5 +-
 .../CodeGen/AMDGPU/GlobalISel/localizer.ll    |   2 -
 8 files changed, 165 insertions(+), 254 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 6f2425c71f09a..5e42ecd0f956d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -63,8 +63,7 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
 
   void assignValueToReg(Register ValVReg, Register PhysReg,
                         const CCValAssign &VA) override {
-    if (VA.getLocVT() == MVT::i1 &&
-        MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64()) {
+    if (VA.getLocVT() == MVT::i1) {
       MIRBuilder.buildCopy(PhysReg, ValVReg);
       return;
     }
@@ -79,7 +78,7 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
     if (TRI->isSGPRReg(MRI, PhysReg)) {
       LLT Ty = MRI.getType(ExtReg);
       LLT S32 = LLT::scalar(32);
-      if (Ty != S32 && Ty != LLT::scalar(64)) {
+      if (Ty != S32) {
         // FIXME: We should probably support readfirstlane intrinsics with all
         // legal 32-bit types.
         assert(Ty.getSizeInBits() == 32);
@@ -127,18 +126,19 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
                         const CCValAssign &VA) override {
     markPhysRegUsed(PhysReg);
 
+    if (VA.getLocVT() == MVT::i1) {
+      MIRBuilder.buildCopy(ValVReg, PhysReg);
+      MRI.setRegClass(ValVReg, MIRBuilder.getMF()
+                                   .getSubtarget<GCNSubtarget>()
+                                   .getRegisterInfo()
+                                   ->getBoolRC());
+      return;
+    }
+
     if (VA.getLocVT().getSizeInBits() < 32) {
       // 16-bit types are reported as legal for 32-bit registers. We need to do
       // a 32-bit copy, and truncate to avoid the verifier complaining about it.
-      //
-      // However, when function return type is i1, it may be in a 64b register.
-      unsigned CopyToBits =
-          (VA.getLocVT() == MVT::i1 &&
-           MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
-              ? 64
-              : 32;
-
-      auto Copy = MIRBuilder.buildCopy(LLT::scalar(CopyToBits), PhysReg);
+      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
 
       // If we have signext/zeroext, it applies to the whole 32-bit register
       // before truncation.
@@ -248,8 +248,7 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
                         const CCValAssign &VA) override {
     MIB.addUse(PhysReg, RegState::Implicit);
 
-    if (VA.getLocVT() == MVT::i1 &&
-        MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64()) {
+    if (VA.getLocVT() == MVT::i1) {
       MIRBuilder.buildCopy(PhysReg, ValVReg);
       return;
     }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index b48a09489653a..f12fe7f1118ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -131,6 +131,12 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
   Register SrcReg = Src.getReg();
 
   if (isVCC(DstReg, *MRI)) {
+    if (SrcReg.isPhysical() && SrcReg != AMDGPU::SCC) {
+      const TargetRegisterClass *DstRC = MRI->getRegClassOrNull(DstReg);
+      if (DstRC)
+        return DstRC->contains(SrcReg);
+    }
+
     if (SrcReg == AMDGPU::SCC) {
       const TargetRegisterClass *RC
         = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
index 81a1994b5afb1..32c7c434d4716 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
@@ -13,10 +13,8 @@ define i1 @i1_func_void() {
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
 ; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT:    [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-; GFX11-NEXT:    $sgpr0 = COPY [[INTRIN]](s32)
-; GFX11-NEXT:    SI_RETURN implicit $sgpr0
+; GFX11-NEXT:    $sgpr0 = COPY [[LOAD]](s1)
+; GFX11-NEXT:    SI_RETURN
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
 }
@@ -30,10 +28,9 @@ define void @test_call_i1_func_void() {
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
 ; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
 ; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
 ;
 ; GFX11-LABEL: name: test_call_i1_func_void
@@ -42,10 +39,9 @@ define void @test_call_i1_func_void() {
 ; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @i1_func_void
 ; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @i1_func_void, csr_amdgpu, implicit-def $sgpr0
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
 ; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    SI_RETURN
   %val = call i1 @i1_func_void()
   store volatile i1 %val, ptr addrspace(1) undef
@@ -64,10 +60,8 @@ define zeroext i1 @zeroext_i1_func_void() {
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
 ; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT:    [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-; GFX11-NEXT:    $sgpr0 = COPY [[INTRIN]](s32)
-; GFX11-NEXT:    SI_RETURN implicit $sgpr0
+; GFX11-NEXT:    $sgpr0 = COPY [[LOAD]](s1)
+; GFX11-NEXT:    SI_RETURN
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
 }
@@ -81,10 +75,9 @@ define void @test_call_zeroext_i1_func_void() {
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
 ; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @zeroext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
 ; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
 ;
 ; GFX11-LABEL: name: test_call_zeroext_i1_func_void
@@ -93,10 +86,9 @@ define void @test_call_zeroext_i1_func_void() {
 ; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @zeroext_i1_func_void
 ; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @zeroext_i1_func_void, csr_amdgpu, implicit-def $sgpr0
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
 ; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    SI_RETURN
   %val = call i1 @zeroext_i1_func_void()
   store volatile i1 %val, ptr addrspace(1) undef
@@ -115,10 +107,8 @@ define signext i1 @signext_i1_func_void() {
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
 ; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT:    [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-; GFX11-NEXT:    $sgpr0 = COPY [[INTRIN]](s32)
-; GFX11-NEXT:    SI_RETURN implicit $sgpr0
+; GFX11-NEXT:    $sgpr0 = COPY [[LOAD]](s1)
+; GFX11-NEXT:    SI_RETURN
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
 }
@@ -132,10 +122,9 @@ define void @test_call_signext_i1_func_void() {
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
 ; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @signext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
 ; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
 ;
 ; GFX11-LABEL: name: test_call_signext_i1_func_void
@@ -144,10 +133,9 @@ define void @test_call_signext_i1_func_void() {
 ; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @signext_i1_func_void
 ; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @signext_i1_func_void, csr_amdgpu, implicit-def $sgpr0
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
 ; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    SI_RETURN
   %val = call i1 @signext_i1_func_void()
   store volatile i1 %val, ptr addrspace(1) undef
@@ -224,13 +212,9 @@ define [2 x i1] @a2i1_func_void() {
 ; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
 ; GFX11-NEXT:    [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
 ; GFX11-NEXT:    [[LOAD2:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD]](p1) :: (load (s1) from `ptr addrspace(1) undef` + 1, addrspace 1)
-; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT:    [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-; GFX11-NEXT:    $sgpr0 = COPY [[INTRIN]](s32)
-; GFX11-NEXT:    [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD2]](s1)
-; GFX11-NEXT:    [[INTRIN2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT3]](s32)
-; GFX11-NEXT:    $sgpr1 = COPY [[INTRIN2]](s32)
-; GFX11-NEXT:    SI_RETURN implicit $sgpr0, implicit $sgpr1
+; GFX11-NEXT:    $sgpr0 = COPY [[LOAD]](s1)
+; GFX11-NEXT:    $sgpr1 = COPY [[LOAD2]](s1)
+; GFX11-NEXT:    SI_RETURN
   %val = load [2 x i1], ptr addrspace(1) undef
   ret [2 x i1] %val
 }
@@ -244,15 +228,13 @@ define void @test_call_a2i1_func_void() {
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
 ; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @a2i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1, implicit-def $sgpr2_sgpr3
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
-; GFX9-NEXT:    [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr2_sgpr3
-; GFX9-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
+; GFX9-NEXT:    [[COPY3:%[0-9]+]]:sreg_64(s1) = COPY $sgpr2_sgpr3
 ; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
 ; GFX9-NEXT:    [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
-; GFX9-NEXT:    G_STORE [[TRUNC2]](s1), [[PTRADD]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX9-NEXT:    G_STORE [[COPY3]](s1), [[PTRADD]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
 ;
 ; GFX11-LABEL: name: test_call_a2i1_func_void
@@ -261,15 +243,13 @@ define void @test_call_a2i1_func_void() {
 ; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a2i1_func_void
 ; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @a2i1_func_void, csr_amdgpu, implicit-def $sgpr0, implicit-def $sgpr1
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
-; GFX11-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
 ; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
 ; GFX11-NEXT:    [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
-; GFX11-NEXT:    G_STORE [[TRUNC2]](s1), [[PTRADD]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX11-NEXT:    G_STORE [[COPY2]](s1), [[PTRADD]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
 ; GFX11-NEXT:    SI_RETURN
   %val = call [2 x i1] @a2i1_func_void()
   store volatile [2 x i1] %val, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
index 134751ee1e313..3e554fc8b638b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
@@ -6,20 +6,18 @@ define void @void_func_i1(i1 %arg0) {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
 ; GFX9-NEXT: {{  $}}
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
-; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
-; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
 ;
 ; GFX11-LABEL: name: void_func_i1
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT:   liveins: $sgpr0
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
-; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    SI_RETURN
   store i1 %arg0, ptr addrspace(1) undef
   ret void
@@ -45,8 +43,7 @@ define void @test_call_void_func_i1() {
 ; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1  
-; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT:    $sgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT:    $sgpr0 = COPY [[LOAD]](s1)
 ; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1, csr_amdgpu, implicit $sgpr0
 ; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX11-NEXT:    SI_RETURN
@@ -60,11 +57,10 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    liveins: $sgpr4_sgpr5
 ; GFX9-NEXT: {{  $}}
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
-; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
 ; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX9-NEXT:    [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
+; GFX9-NEXT:    [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s1)
 ; GFX9-NEXT:    [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ZEXT]], [[CONST]]
 ; GFX9-NEXT:    G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
@@ -73,11 +69,10 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) {
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT:    liveins: $sgpr0
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
 ; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX11-NEXT:    [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
+; GFX11-NEXT:    [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s1)
 ; GFX11-NEXT:    [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ZEXT]], [[CONST]]
 ; GFX11-NEXT:    G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    SI_RETURN
@@ -107,8 +102,7 @@ define void @test_call_void_func_i1_zeroext() {
 ; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_zeroext 
-; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT:    $sgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT:    $sgpr0 = COPY [[LOAD]](s1)
 ; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_zeroext, csr_amdgpu, implicit $sgpr0
 ; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX11-NEXT:    SI_RETURN
@@ -122,11 +116,10 @@ define void @void_func_i1_signext(i1 signext %arg0) {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    liveins: $sgpr4_sgpr5
 ; GFX9-NEXT: {{  $}}
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
-; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
 ; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX9-NEXT:    [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
+; GFX9-NEXT:    [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s1)
 ; GFX9-NEXT:    [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT]], [[CONST]]
 ; GFX9-NEXT:    G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
@@ -135,11 +128,10 @@ define void @void_func_i1_signext(i1 signext %arg0) {
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT:    liveins: $sgpr0
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
 ; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX11-NEXT:    [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
+; GFX11-NEXT:    [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s1)
 ; GFX11-NEXT:    [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT]], [[CONST]]
 ; GFX11-NEXT:    G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    SI_RETURN
@@ -169,8 +161,7 @@ define void @test_call_void_func_i1_signext() {
 ; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_signext 
-; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT:    $sgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT:    $sgpr0 = COPY [[LOAD]](s1)
 ; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_signext, csr_amdgpu, implicit $sgpr0
 ; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX11-NEXT:    SI_RETURN
@@ -184,30 +175,26 @@ define void @void_func_a2i1([2 x i1] %arg0) {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
 ; GFX9-NEXT: {{  $}}
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
-; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
-; GFX9-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
-; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
 ; GFX9-NEXT:    [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
-; GFX9-NEXT:    G_STORE [[TRUNC2]](s1), [[PTRADD]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[PTRADD]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
 ;
 ; GFX11-LABEL: name: void_func_a2i1
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT:    liveins: $sgpr0, $sgpr1
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
-; GFX11-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
-; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
 ; GFX11-NEXT:    [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
-; GFX11-NEXT:    G_STORE [[TRUNC2]](s1), [[PTRADD]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX11-NEXT:    G_STORE [[COPY2]](s1), [[PTRADD]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
 ; GFX11-NEXT:    SI_RETURN
   store [2 x i1] %arg0, ptr addrspace(1) undef
   ret void
@@ -234,10 +221,8 @@ define void @test_call_void_func_a2i1() {
 ; GFX11-NEXT:    [[CONST2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
 ; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_a2i1 
-; GFX11-NEXT:    [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST1]](s1)
-; GFX11-NEXT:    $sgpr0 = COPY [[ANYEXT1]](s32)
-; GFX11-NEXT:    [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST2]](s1)
-; GFX11-NEXT:    $sgpr1 = COPY [[ANYEXT2]](s32)
+; GFX11-NEXT:    $sgpr0 = COPY [[CONST1]](s1)
+; GFX11-NEXT:    $sgpr1 = COPY [[CONST2]](s1)
 ; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_a2i1, csr_amdgpu, implicit $sgpr0, implicit $sgpr1
 ; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX11-NEXT:    SI_RETURN
@@ -252,26 +237,22 @@ define void @void_func_i1_i1(i1 %arg0, i1 %arg1) {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
 ; GFX9-NEXT: {{  $}}
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
-; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
-; GFX9-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
-; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT:    G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
 ;
 ; GFX11-LABEL: name: void_func_i1_i1
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT:    liveins: $sgpr0, $sgpr1
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
-; GFX11-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
-; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT:    G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    SI_RETURN
   store volatile i1 %arg0, ptr addrspace(1) undef
   store volatile i1 %arg1, ptr addrspace(1) undef
@@ -301,10 +282,8 @@ define void @test_call_void_func_i1_i1() {
 ; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_i1
-; GFX11-NEXT:    [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT:    $sgpr0 = COPY [[ANYEXT1]](s32)
-; GFX11-NEXT:    [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
-; GFX11-NEXT:    $sgpr1 = COPY [[ANYEXT2]](s32)
+; GFX11-NEXT:    $sgpr0 = COPY [[LOAD]](s1)
+; GFX11-NEXT:    $sgpr1 = COPY [[CONST]](s1)
 ; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_i1, csr_amdgpu, implicit $sgpr0, implicit $sgpr1
 ; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX11-NEXT:    SI_RETURN
@@ -322,32 +301,19 @@ define void @many_i1_args(
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
 ; GFX9-NEXT: {{  $}}
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
-; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
-; GFX9-NEXT:    [[COPY1:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
-; GFX9-NEXT:    [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s64)
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr8_sgpr9
-; GFX9-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
-; GFX9-NEXT:    [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr10_sgpr11
-; GFX9-NEXT:    [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
-; GFX9-NEXT:    [[COPY4:%[0-9]+]]:_(s64) = COPY $sgpr12_sgpr13
-; GFX9-NEXT:    [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s64)
-; GFX9-NEXT:    [[COPY5:%[0-9]+]]:_(s64) = COPY $sgpr14_sgpr15
-; GFX9-NEXT:    [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s64)
-; GFX9-NEXT:    [[COPY6:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
-; GFX9-NEXT:    [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s64)
-; GFX9-NEXT:    [[COPY7:%[0-9]+]]:_(s64) = COPY $sgpr18_sgpr19
-; GFX9-NEXT:    [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s64)
-; GFX9-NEXT:    [[COPY8:%[0-9]+]]:_(s64) = COPY $sgpr20_sgpr21
-; GFX9-NEXT:    [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s64)
-; GFX9-NEXT:    [[COPY9:%[0-9]+]]:_(s64) = COPY $sgpr22_sgpr23
-; GFX9-NEXT:    [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s64)
-; GFX9-NEXT:    [[COPY10:%[0-9]+]]:_(s64) = COPY $sgpr24_sgpr25
-; GFX9-NEXT:    [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s64)
-; GFX9-NEXT:    [[COPY11:%[0-9]+]]:_(s64) = COPY $sgpr26_sgpr27
-; GFX9-NEXT:    [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s64)
-; GFX9-NEXT:    [[COPY12:%[0-9]+]]:_(s64) = COPY $sgpr28_sgpr29
-; GFX9-NEXT:    [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s64)
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY1:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr8_sgpr9
+; GFX9-NEXT:    [[COPY3:%[0-9]+]]:sreg_64(s1) = COPY $sgpr10_sgpr11
+; GFX9-NEXT:    [[COPY4:%[0-9]+]]:sreg_64(s1) = COPY $sgpr12_sgpr13
+; GFX9-NEXT:    [[COPY5:%[0-9]+]]:sreg_64(s1) = COPY $sgpr14_sgpr15
+; GFX9-NEXT:    [[COPY6:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
+; GFX9-NEXT:    [[COPY7:%[0-9]+]]:sreg_64(s1) = COPY $sgpr18_sgpr19
+; GFX9-NEXT:    [[COPY8:%[0-9]+]]:sreg_64(s1) = COPY $sgpr20_sgpr21
+; GFX9-NEXT:    [[COPY9:%[0-9]+]]:sreg_64(s1) = COPY $sgpr22_sgpr23
+; GFX9-NEXT:    [[COPY10:%[0-9]+]]:sreg_64(s1) = COPY $sgpr24_sgpr25
+; GFX9-NEXT:    [[COPY11:%[0-9]+]]:sreg_64(s1) = COPY $sgpr26_sgpr27
+; GFX9-NEXT:    [[COPY12:%[0-9]+]]:sreg_64(s1) = COPY $sgpr28_sgpr29
 ; GFX9-NEXT:    [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT:    [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT:    [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -388,82 +354,58 @@ define void @many_i1_args(
 ; GFX9-NEXT:    [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
 ;
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-; G_STOREs to TRUNC1-TRUNC30 omitted
+; GFX9-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs to COPY1-COPY11 omitted
+; GFX9:         G_STORE [[COPY12]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[TRUNC13]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs to TRUNC14-TRUNC30 omitted
 ; GFX9:         G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ;
 ; GFX11-LABEL: name: many_i1_args
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
-; GFX11-NEXT:   [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
-; GFX11-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
-; GFX11-NEXT:   [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
-; GFX11-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr3
-; GFX11-NEXT:   [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s32)
-; GFX11-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr4
-; GFX11-NEXT:   [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s32)
-; GFX11-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr5
-; GFX11-NEXT:   [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s32)
-; GFX11-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr6
-; GFX11-NEXT:   [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s32)
-; GFX11-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr7
-; GFX11-NEXT:   [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
-; GFX11-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr8
-; GFX11-NEXT:   [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
-; GFX11-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr9
-; GFX11-NEXT:   [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
-; GFX11-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr10
-; GFX11-NEXT:   [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
-; GFX11-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr11
-; GFX11-NEXT:   [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
-; GFX11-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr12
-; GFX11-NEXT:   [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
-; GFX11-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY $sgpr13
-; GFX11-NEXT:   [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
-; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY $sgpr14
-; GFX11-NEXT:   [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
-; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $sgpr15
-; GFX11-NEXT:   [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
-; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $sgpr16
-; GFX11-NEXT:   [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
-; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $sgpr17
-; GFX11-NEXT:   [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
-; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $sgpr18
-; GFX11-NEXT:   [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
-; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $sgpr19
-; GFX11-NEXT:   [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
-; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $sgpr20
-; GFX11-NEXT:   [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
-; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $sgpr21
-; GFX11-NEXT:   [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
-; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $sgpr22
-; GFX11-NEXT:   [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
-; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $sgpr23
-; GFX11-NEXT:   [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
-; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $sgpr24
-; GFX11-NEXT:   [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
-; GFX11-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $sgpr25
-; GFX11-NEXT:   [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
-; GFX11-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $sgpr26
-; GFX11-NEXT:   [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
-; GFX11-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $sgpr27
-; GFX11-NEXT:   [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
-; GFX11-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $sgpr28
-; GFX11-NEXT:   [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
-; GFX11-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $sgpr29
-; GFX11-NEXT:   [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
+; GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr2
+; GFX11-NEXT:   [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY $sgpr3
+; GFX11-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY $sgpr4
+; GFX11-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY $sgpr5
+; GFX11-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY $sgpr6
+; GFX11-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY $sgpr7
+; GFX11-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY $sgpr8
+; GFX11-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY $sgpr9
+; GFX11-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY $sgpr10
+; GFX11-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY $sgpr11
+; GFX11-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY $sgpr12
+; GFX11-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY $sgpr13
+; GFX11-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY $sgpr14
+; GFX11-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY $sgpr15
+; GFX11-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY $sgpr16
+; GFX11-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY $sgpr17
+; GFX11-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY $sgpr18
+; GFX11-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY $sgpr19
+; GFX11-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY $sgpr20
+; GFX11-NEXT:   [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY $sgpr21
+; GFX11-NEXT:   [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY $sgpr22
+; GFX11-NEXT:   [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY $sgpr23
+; GFX11-NEXT:   [[COPY24:%[0-9]+]]:sreg_32(s1) = COPY $sgpr24
+; GFX11-NEXT:   [[COPY25:%[0-9]+]]:sreg_32(s1) = COPY $sgpr25
+; GFX11-NEXT:   [[COPY26:%[0-9]+]]:sreg_32(s1) = COPY $sgpr26
+; GFX11-NEXT:   [[COPY27:%[0-9]+]]:sreg_32(s1) = COPY $sgpr27
+; GFX11-NEXT:   [[COPY28:%[0-9]+]]:sreg_32(s1) = COPY $sgpr28
+; GFX11-NEXT:   [[COPY29:%[0-9]+]]:sreg_32(s1) = COPY $sgpr29
 ; GFX11-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX11-NEXT:   [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
 ; GFX11-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX11-NEXT:   [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
 ;
 ; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX11-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:   G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs to COPY1-COPY28 omitted
+; GFX11:        G_STORE [[COPY29]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; G_STOREs to TRUNC1-TRUNC30 omitted
+; GFX11-NEXT:   G_STORE [[TRUNC30]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11:        G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   store volatile i1 %arg0, ptr addrspace(1) undef
   store volatile i1 %arg1, ptr addrspace(1) undef
@@ -509,12 +451,11 @@ define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    liveins: $sgpr6, $sgpr4_sgpr5
 ; GFX9-NEXT: {{  $}}
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
-; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
 ; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr6
 ; GFX9-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
-; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
 ;
@@ -522,12 +463,11 @@ define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT:    liveins: $sgpr0, $sgpr1
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
 ; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
 ; GFX11-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
-; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    SI_RETURN
   store volatile i1 %arg0, ptr addrspace(1) undef
@@ -542,11 +482,10 @@ define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
 ; GFX9-NEXT: {{  $}}
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr4
 ; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
-; GFX9-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
 ; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT:    G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
 ;
 ; GFX11-LABEL: name: void_func_i1_inreg_i1
@@ -555,11 +494,10 @@ define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
 ; GFX11-NEXT: {{  $}}
 ; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
 ; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
-; GFX11-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
 ; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT:    G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    SI_RETURN
   store volatile i1 %arg0, ptr addrspace(1) undef
   store volatile i1 %arg1, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
index 3db0acceec0b3..ec999149daed8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
@@ -199,10 +199,9 @@ define amdgpu_kernel void @test_call_external_i1_func_void() #0 {
   ; GCN-NEXT:   $sgpr15 = COPY [[DEF2]](s32)
   ; GCN-NEXT:   $vgpr31 = COPY [[OR1]](s32)
   ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
-  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; GCN-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+  ; GCN-NEXT:   G_STORE [[COPY21]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
   %val = call i1 @external_i1_func_void()
   store volatile i1 %val, ptr addrspace(1) undef
@@ -276,10 +275,9 @@ define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 {
   ; GCN-NEXT:   $sgpr15 = COPY [[DEF2]](s32)
   ; GCN-NEXT:   $vgpr31 = COPY [[OR1]](s32)
   ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
-  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; GCN-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
+  ; GCN-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY21]](s1)
   ; GCN-NEXT:   G_STORE [[ZEXT]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
   %val = call i1 @external_i1_zeroext_func_void()
@@ -336,10 +334,9 @@ define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 {
   ; GCN-NEXT:   $sgpr15 = COPY [[DEF2]](s32)
   ; GCN-NEXT:   $vgpr31 = COPY [[OR1]](s32)
   ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
-  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; GCN-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
+  ; GCN-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY21]](s1)
   ; GCN-NEXT:   G_STORE [[SEXT]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
   %val = call i1 @external_i1_signext_func_void()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index eece4397d1855..5d2f794b94c4d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -37,20 +37,18 @@ define void @void_func_i1(i1 %arg0) #0 {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $sgpr16_sgpr17
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
   ;
   ; GFX11-LABEL: name: void_func_i1
   ; GFX11: bb.1 (%ir-block.0):
   ; GFX11-NEXT:   liveins: $sgpr0
   ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-  ; GFX11-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+  ; GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
   ; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; GFX11-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+  ; GFX11-NEXT:   G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   ; GFX11-NEXT:   SI_RETURN
   store i1 %arg0, ptr addrspace(1) undef
   ret void
@@ -61,11 +59,10 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $sgpr16_sgpr17
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
+  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s1)
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ZEXT]], [[C]]
   ; CHECK-NEXT:   G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
@@ -80,11 +77,10 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $sgpr16_sgpr17
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
+  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s1)
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT]], [[C]]
   ; CHECK-NEXT:   G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
@@ -100,14 +96,13 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; CHECK-NEXT:   liveins: $sgpr16_sgpr17
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[TRUNC]], [[C]]
-  ; CHECK-NEXT:   [[INTRINSIC_W_SIDE_EFFECTS:%[0-9]+]]:_(s1), [[INTRINSIC_W_SIDE_EFFECTS1:%[0-9]+]]:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[XOR]](s1)
-  ; CHECK-NEXT:   G_BRCOND [[INTRINSIC_W_SIDE_EFFECTS]](s1), %bb.2
+  ; CHECK-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY]], [[C]]
+  ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:%[0-9]+]]:_(s1), [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS1:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[XOR]](s1)
+  ; CHECK-NEXT:   G_BRCOND [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS]](s1), %bb.2
   ; CHECK-NEXT:   G_BR %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.bb1:
@@ -1998,8 +1993,7 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
   ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, addrspace 5)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32)
-  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s64)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
   ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s16) from %fixed-stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s16)
@@ -2009,7 +2003,7 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
   ; CHECK-NEXT:   [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s16) from %fixed-stack.0, align 4, addrspace 5)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `ptr addrspace(1) undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[COPY31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 
   ; CHECK-NEXT:   G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   G_STORE [[LOAD2]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
index 6360c5c2cbb2e..aa6f518a3e30f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
@@ -24,11 +24,10 @@ define i32 @load_select_const_i32_gv(i1 %cond) {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv0
   ; CHECK-NEXT:   [[GV1:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv1
-  ; CHECK-NEXT:   [[SELECT:%[0-9]+]]:_(p1) = G_SELECT [[TRUNC]](s1), [[GV]], [[GV1]]
+  ; CHECK-NEXT:   [[SELECT:%[0-9]+]]:_(p1) = G_SELECT [[COPY]](s1), [[GV]], [[GV1]]
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[SELECT]](p1) :: (dereferenceable invariant load (s32) from %ir.select, addrspace 1)
   ; CHECK-NEXT:   $vgpr0 = COPY [[LOAD]](s32)
   ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 4d04d6b7570c2..40cd5d88d4a38 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -168,8 +168,6 @@ define void @localize_internal_globals(i1 %cond) {
 ; GFX9-LABEL: localize_internal_globals:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s4, 1, s4
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]

>From 4a82212cb3f27454a09f36a9f5c8b7d67bb461da Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 13 May 2024 14:11:10 -0500
Subject: [PATCH 18/25] (1) avoid using reserved ScratchRSrcReg (2) update/add
 testcases.

---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |  53 +-
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h   |   7 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   6 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  10 +
 .../GlobalISel/function-call-i1-return.ll     |  18 +-
 .../AMDGPU/GlobalISel/function-i1-args.ll     | 518 +++++++++-
 .../AMDGPU/GlobalISel/function-returns.ll     |   6 +-
 .../AMDGPU/GlobalISel/irtranslator-call.ll    |  86 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll              | 946 ++++++++++++------
 .../codegen-prepare-addrspacecast-non-null.ll |  46 +-
 .../AMDGPU/divergence-driven-trunc-to-i1.ll   |  12 +-
 llvm/test/CodeGen/AMDGPU/extract-load-i1.ll   |   2 +-
 .../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll |  25 +-
 .../AMDGPU/fsub-as-fneg-src-modifier.ll       | 212 ++--
 .../CodeGen/AMDGPU/function-call-i1-return.ll |  24 +-
 llvm/test/CodeGen/AMDGPU/function-returns.ll  |   6 +-
 .../identical-subrange-spill-infloop.ll       | 599 ++++++-----
 llvm/test/CodeGen/AMDGPU/indirect-call.ll     |  24 +-
 .../AMDGPU/lds-global-non-entry-func.ll       | 134 ++-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll |   6 +-
 .../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll    | 480 +++------
 .../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll     | 598 ++++-------
 llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll   | 108 +-
 llvm/test/CodeGen/AMDGPU/llvm.mulo.ll         | 244 ++---
 .../AMDGPU/loop-on-function-argument.ll       |  12 +-
 .../test/CodeGen/AMDGPU/loop_exit_with_xor.ll |   4 +-
 ...p-var-out-of-divergent-loop-swdev407790.ll |  33 +-
 .../CodeGen/AMDGPU/mul24-pass-ordering.ll     |  16 +-
 .../si-annotate-nested-control-flows.ll       |  10 +-
 .../si-optimize-vgpr-live-range-dbg-instr.ll  |  22 +-
 .../AMDGPU/srem-seteq-illegal-types.ll        |  11 +-
 .../CodeGen/AMDGPU/stacksave_stackrestore.ll  |  56 +-
 .../AMDGPU/urem-seteq-illegal-types.ll        |  16 +-
 33 files changed, 2230 insertions(+), 2120 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 5e42ecd0f956d..c69cf8c34a6b2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -65,6 +65,7 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
                         const CCValAssign &VA) override {
     if (VA.getLocVT() == MVT::i1) {
       MIRBuilder.buildCopy(PhysReg, ValVReg);
+      MIB.addUse(PhysReg, RegState::Implicit);
       return;
     }
 
@@ -316,6 +317,31 @@ bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
   return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv, IsVarArg));
 }
 
+/// Special handling for i1 return val: based on determineAndHandleAssignments()
+bool AMDGPUCallLowering::determineAndHandleAssignmentsForI1Return(
+    ValueHandler &Handler, ValueAssigner &Assigner,
+    SmallVectorImpl<ArgInfo> &Args, MachineIRBuilder &MIRBuilder,
+    CallingConv::ID CallConv, bool IsVarArg) const {
+
+  MachineFunction &MF = MIRBuilder.getMF();
+  const Function &F = MF.getFunction();
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+
+  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, F.getContext());
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.enableFlatScratch()) {
+    SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+    CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
+  }
+
+  if (!determineAssignments(Assigner, Args, CCInfo))
+    return false;
+
+  return handleAssignments(Handler, Args, CCInfo, ArgLocs, MIRBuilder);
+}
+
 /// Lower the return value for the already existing \p Ret. This assumes that
 /// \p B's insertion point is correct.
 bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
@@ -378,8 +404,13 @@ bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
 
   OutgoingValueAssigner Assigner(AssignFn);
   AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
-  return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
-                                       CC, F.isVarArg());
+
+  if (SplitEVTs.size() == 1 && SplitEVTs[0] == MVT::i1)
+    return determineAndHandleAssignmentsForI1Return(
+        RetHandler, Assigner, SplitRetInfos, B, CC, F.isVarArg());
+  else
+    return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
+                                         CC, F.isVarArg());
 }
 
 bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
@@ -1493,6 +1524,11 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
       return false;
   }
 
+  if (!ST.enableFlatScratch()) {
+    SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+    CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
+  }
+
   // Do the actual argument marshalling.
   SmallVector<Register, 8> PhysRegs;
 
@@ -1539,9 +1575,16 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                                       Info.IsVarArg);
     IncomingValueAssigner Assigner(RetAssignFn);
     CallReturnHandler Handler(MIRBuilder, MRI, MIB);
-    if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
-                                       Info.CallConv, Info.IsVarArg))
-      return false;
+    if (Info.OrigRet.Ty->isIntegerTy(1)) {
+      if (!determineAndHandleAssignmentsForI1Return(Handler, Assigner, InArgs,
+                                                    MIRBuilder, Info.CallConv,
+                                                    Info.IsVarArg))
+        return false;
+    } else {
+      if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
+                                         Info.CallConv, Info.IsVarArg))
+        return false;
+    }
   }
 
   uint64_t CalleePopBytes = NumBytes;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index a6e801f2a547b..afe3a7a19601a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -37,6 +37,13 @@ class AMDGPUCallLowering final : public CallLowering {
   bool lowerReturnVal(MachineIRBuilder &B, const Value *Val,
                       ArrayRef<Register> VRegs, MachineInstrBuilder &Ret) const;
 
+  bool determineAndHandleAssignmentsForI1Return(ValueHandler &Handler,
+                                                ValueAssigner &Assigner,
+                                                SmallVectorImpl<ArgInfo> &Args,
+                                                MachineIRBuilder &MIRBuilder,
+                                                CallingConv::ID CallConv,
+                                                bool IsVarArg) const;
+
 public:
   AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 56345d14a331c..aa44cca11f800 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3741,6 +3741,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     if (!DstBank)
       DstBank = SrcBank;
 
+    // For i1 return value, the dst reg is an SReg but we need to set the reg
+    // bank to VCCRegBank.
+    if (!MI.getOperand(0).getReg().isVirtual() &&
+        SrcBank == &AMDGPU::VCCRegBank)
+      DstBank = SrcBank;
+
     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
     if (MI.getOpcode() != AMDGPU::G_FREEZE &&
         cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size)))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 618fdd95f4a4b..d98045f422878 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3149,6 +3149,9 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                  *DAG.getContext());
 
+  if (!Subtarget->enableFlatScratch())
+    CCInfo.AllocateReg(Info->getScratchRSrcReg());
+
   // Analyze outgoing return values.
   CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
 
@@ -3228,6 +3231,13 @@ SDValue SITargetLowering::LowerCallResult(
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
                  *DAG.getContext());
+
+  if (!Subtarget->enableFlatScratch()) {
+    SIMachineFunctionInfo *FuncInfo =
+        DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
+    CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
+  }
+
   CCInfo.AnalyzeCallResult(Ins, RetCC);
 
   // Copy all of the result registers out of their specified physreg.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
index 32c7c434d4716..a022c13f38f9a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
@@ -6,7 +6,7 @@ define i1 @i1_func_void() {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
 ; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    $sgpr4_sgpr5 = COPY [[LOAD]](s1)
 ; GFX9-NEXT:    SI_RETURN
 ;
 ; GFX11-LABEL: name: i1_func_void
@@ -27,8 +27,8 @@ define void @test_call_i1_func_void() {
 ; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @i1_func_void
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
 ; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
@@ -53,7 +53,7 @@ define zeroext i1 @zeroext_i1_func_void() {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
 ; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    $sgpr4_sgpr5 = COPY [[LOAD]](s1)
 ; GFX9-NEXT:    SI_RETURN
 ;
 ; GFX11-LABEL: name: zeroext_i1_func_void
@@ -74,8 +74,8 @@ define void @test_call_zeroext_i1_func_void() {
 ; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @zeroext_i1_func_void
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @zeroext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @zeroext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
 ; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
@@ -100,7 +100,7 @@ define signext i1 @signext_i1_func_void() {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
 ; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    $sgpr4_sgpr5 = COPY [[LOAD]](s1)
 ; GFX9-NEXT:    SI_RETURN
 ;
 ; GFX11-LABEL: name: signext_i1_func_void
@@ -121,8 +121,8 @@ define void @test_call_signext_i1_func_void() {
 ; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @signext_i1_func_void
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @signext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @signext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
 ; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
index 3e554fc8b638b..47c4682196d60 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
@@ -30,10 +30,10 @@ define void @test_call_void_func_i1() {
 ; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1  
-; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    $sgpr4_sgpr5 = COPY [[LOAD]](s1)
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX9-NEXT:    SI_RETURN
 ;
@@ -89,10 +89,10 @@ define void @test_call_void_func_i1_zeroext() {
 ; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_zeroext 
-; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    $sgpr4_sgpr5 = COPY [[LOAD]](s1)
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_zeroext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_zeroext, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX9-NEXT:    SI_RETURN
 ;
@@ -148,10 +148,10 @@ define void @test_call_void_func_i1_signext() {
 ; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_signext 
-; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    $sgpr4_sgpr5 = COPY [[LOAD]](s1)
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_signext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_signext, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX9-NEXT:    SI_RETURN
 ;
@@ -207,11 +207,11 @@ define void @test_call_void_func_a2i1() {
 ; GFX9-NEXT:    [[CONST2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
 ; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_a2i1 
-; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[CONST1]](s1)
-; GFX9-NEXT:    $sgpr2_sgpr3 = COPY [[CONST2]](s1)
+; GFX9-NEXT:    $sgpr4_sgpr5 = COPY [[CONST1]](s1)
+; GFX9-NEXT:    $sgpr6_sgpr7 = COPY [[CONST2]](s1)
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_a2i1, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_a2i1, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX9-NEXT:    SI_RETURN
 ;
@@ -267,11 +267,11 @@ define void @test_call_void_func_i1_i1() {
 ; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_i1
-; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[LOAD]](s1)
-; GFX9-NEXT:    $sgpr2_sgpr3 = COPY [[CONST]](s1)
+; GFX9-NEXT:    $sgpr4_sgpr5 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    $sgpr6_sgpr7 = COPY [[CONST]](s1)
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_i1, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_i1, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX9-NEXT:    SI_RETURN
 ;
@@ -292,12 +292,12 @@ define void @test_call_void_func_i1_i1() {
   ret void
 }
 
-define void @many_i1_args(
+define void @exhaust_sgprs_by_i1_args(
   i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
   i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
   i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
   i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
-; GFX9-LABEL: name: many_i1_args
+; GFX9-LABEL: name: exhaust_sgprs_by_i1_args
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
 ; GFX9-NEXT: {{  $}}
@@ -361,7 +361,7 @@ define void @many_i1_args(
 ; G_STOREs to TRUNC14-TRUNC30 omitted
 ; GFX9:         G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ;
-; GFX11-LABEL: name: many_i1_args
+; GFX11-LABEL: name: exhaust_sgprs_by_i1_args
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1
 ; GFX11-NEXT: {{  $}}
@@ -446,6 +446,237 @@ define void @many_i1_args(
   ret void
 }
 
+define void @void_func_a48i1([48 x i1] %arg0) {
+; GFX9-LABEL: name: void_func_a48i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
+; GFX9-NEXT: {{  $}}
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY1:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr8_sgpr9
+; GFX9-NEXT:    [[COPY3:%[0-9]+]]:sreg_64(s1) = COPY $sgpr10_sgpr11
+; GFX9-NEXT:    [[COPY4:%[0-9]+]]:sreg_64(s1) = COPY $sgpr12_sgpr13
+; GFX9-NEXT:    [[COPY5:%[0-9]+]]:sreg_64(s1) = COPY $sgpr14_sgpr15
+; GFX9-NEXT:    [[COPY6:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
+; GFX9-NEXT:    [[COPY7:%[0-9]+]]:sreg_64(s1) = COPY $sgpr18_sgpr19
+; GFX9-NEXT:    [[COPY8:%[0-9]+]]:sreg_64(s1) = COPY $sgpr20_sgpr21
+; GFX9-NEXT:    [[COPY9:%[0-9]+]]:sreg_64(s1) = COPY $sgpr22_sgpr23
+; GFX9-NEXT:    [[COPY10:%[0-9]+]]:sreg_64(s1) = COPY $sgpr24_sgpr25
+; GFX9-NEXT:    [[COPY11:%[0-9]+]]:sreg_64(s1) = COPY $sgpr26_sgpr27
+; GFX9-NEXT:    [[COPY12:%[0-9]+]]:sreg_64(s1) = COPY $sgpr28_sgpr29
+; GFX9-NEXT:    [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX9-NEXT:    [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
+; GFX9-NEXT:    [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX9-NEXT:    [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
+; GFX9-NEXT:    [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr2
+; GFX9-NEXT:    [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
+; GFX9-NEXT:    [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr3
+; GFX9-NEXT:    [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
+; GFX9-NEXT:    [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr4
+; GFX9-NEXT:    [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
+; GFX9-NEXT:    [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr5
+; GFX9-NEXT:    [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
+; GFX9-NEXT:    [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr6
+; GFX9-NEXT:    [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+; GFX9-NEXT:    [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr7
+; GFX9-NEXT:    [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
+; GFX9-NEXT:    [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr8
+; GFX9-NEXT:    [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+; GFX9-NEXT:    [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr9
+; GFX9-NEXT:    [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
+; GFX9-NEXT:    [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr10
+; GFX9-NEXT:    [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
+; GFX9-NEXT:    [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr11
+; GFX9-NEXT:    [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
+; GFX9-NEXT:    [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr12
+; GFX9-NEXT:    [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
+; GFX9-NEXT:    [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr13
+; GFX9-NEXT:    [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
+; GFX9-NEXT:    [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr14
+; GFX9-NEXT:    [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
+; GFX9-NEXT:    [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr15
+; GFX9-NEXT:    [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
+; GFX9-NEXT:    [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr16
+; GFX9-NEXT:    [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; GFX9-NEXT:    [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr17
+; GFX9-NEXT:    [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; GFX9-NEXT:    [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr18
+; GFX9-NEXT:    [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+; GFX9-NEXT:    [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr19
+; GFX9-NEXT:    [[TRUNC32:%[0-9]+]]:_(s1) = G_TRUNC [[COPY32]](s32)
+; GFX9-NEXT:    [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr20
+; GFX9-NEXT:    [[TRUNC33:%[0-9]+]]:_(s1) = G_TRUNC [[COPY33]](s32)
+; GFX9-NEXT:    [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr21
+; GFX9-NEXT:    [[TRUNC34:%[0-9]+]]:_(s1) = G_TRUNC [[COPY34]](s32)
+; GFX9-NEXT:    [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr22
+; GFX9-NEXT:    [[TRUNC35:%[0-9]+]]:_(s1) = G_TRUNC [[COPY35]](s32)
+; GFX9-NEXT:    [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr23
+; GFX9-NEXT:    [[TRUNC36:%[0-9]+]]:_(s1) = G_TRUNC [[COPY36]](s32)
+; GFX9-NEXT:    [[COPY37:%[0-9]+]]:_(s32) = COPY $vgpr24
+; GFX9-NEXT:    [[TRUNC37:%[0-9]+]]:_(s1) = G_TRUNC [[COPY37]](s32)
+; GFX9-NEXT:    [[COPY38:%[0-9]+]]:_(s32) = COPY $vgpr25
+; GFX9-NEXT:    [[TRUNC38:%[0-9]+]]:_(s1) = G_TRUNC [[COPY38]](s32)
+; GFX9-NEXT:    [[COPY39:%[0-9]+]]:_(s32) = COPY $vgpr26
+; GFX9-NEXT:    [[TRUNC39:%[0-9]+]]:_(s1) = G_TRUNC [[COPY39]](s32)
+; GFX9-NEXT:    [[COPY40:%[0-9]+]]:_(s32) = COPY $vgpr27
+; GFX9-NEXT:    [[TRUNC40:%[0-9]+]]:_(s1) = G_TRUNC [[COPY40]](s32)
+; GFX9-NEXT:    [[COPY41:%[0-9]+]]:_(s32) = COPY $vgpr28
+; GFX9-NEXT:    [[TRUNC41:%[0-9]+]]:_(s1) = G_TRUNC [[COPY41]](s32)
+; GFX9-NEXT:    [[COPY42:%[0-9]+]]:_(s32) = COPY $vgpr29
+; GFX9-NEXT:    [[TRUNC42:%[0-9]+]]:_(s1) = G_TRUNC [[COPY42]](s32)
+; GFX9-NEXT:    [[COPY43:%[0-9]+]]:_(s32) = COPY $vgpr30
+; GFX9-NEXT:    [[TRUNC43:%[0-9]+]]:_(s1) = G_TRUNC [[COPY43]](s32)
+; GFX9-NEXT:    [[FRAME1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
+; GFX9-NEXT:    [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME1]](p5) :: (invariant load (s1) from %fixed-stack.3, align 16, addrspace 5)
+; GFX9-NEXT:    [[TRUNC44:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD1]](s32)
+; GFX9-NEXT:    [[FRAME2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
+; GFX9-NEXT:    [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME2]](p5) :: (invariant load (s1) from %fixed-stack.2, align 4, addrspace 5)
+; GFX9-NEXT:    [[TRUNC45:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD2]](s32)
+; GFX9-NEXT:    [[FRAME3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
+; GFX9-NEXT:    [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME3]](p5) :: (invariant load (s1) from %fixed-stack.1, align 8, addrspace 5)
+; GFX9-NEXT:    [[TRUNC46:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD3]](s32)
+; GFX9-NEXT:    [[FRAME4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
+; GFX9-NEXT:    [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME4]](p5) :: (invariant load (s1) from %fixed-stack.0, align 4, addrspace 5)
+; GFX9-NEXT:    [[TRUNC47:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD4]](s32)
+;
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    [[CONST1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; GFX9-NEXT:    [[PTRADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST1]](s64)
+; GFX9-NEXT:    G_STORE [[COPY1]](s1), [[PTRADD1]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+;
+; G_STOREs to COPY2-COPY12, TRUNC13-TRUNC46 omitted
+; GFX9:         [[CONST47:%[0-9]+]]:_(s64) = G_CONSTANT i64 47
+; GFX9-NEXT:    [[PTRADD47:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST47]](s64)
+; GFX9-NEXT:    G_STORE [[TRUNC47]](s1), [[PTRADD47]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 47, addrspace 1)
+
+  store [48 x i1] %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_a64i1([64 x i1] %arg0) {
+; GFX11-LABEL: name: void_func_a64i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
+; GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr2
+; GFX11-NEXT:   [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY $sgpr3
+; GFX11-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY $sgpr4
+; GFX11-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY $sgpr5
+; GFX11-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY $sgpr6
+; GFX11-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY $sgpr7
+; GFX11-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY $sgpr8
+; GFX11-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY $sgpr9
+; GFX11-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY $sgpr10
+; GFX11-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY $sgpr11
+; GFX11-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY $sgpr12
+; GFX11-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY $sgpr13
+; GFX11-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY $sgpr14
+; GFX11-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY $sgpr15
+; GFX11-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY $sgpr16
+; GFX11-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY $sgpr17
+; GFX11-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY $sgpr18
+; GFX11-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY $sgpr19
+; GFX11-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY $sgpr20
+; GFX11-NEXT:   [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY $sgpr21
+; GFX11-NEXT:   [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY $sgpr22
+; GFX11-NEXT:   [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY $sgpr23
+; GFX11-NEXT:   [[COPY24:%[0-9]+]]:sreg_32(s1) = COPY $sgpr24
+; GFX11-NEXT:   [[COPY25:%[0-9]+]]:sreg_32(s1) = COPY $sgpr25
+; GFX11-NEXT:   [[COPY26:%[0-9]+]]:sreg_32(s1) = COPY $sgpr26
+; GFX11-NEXT:   [[COPY27:%[0-9]+]]:sreg_32(s1) = COPY $sgpr27
+; GFX11-NEXT:   [[COPY28:%[0-9]+]]:sreg_32(s1) = COPY $sgpr28
+; GFX11-NEXT:   [[COPY29:%[0-9]+]]:sreg_32(s1) = COPY $sgpr29
+; GFX11-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT:   [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; GFX11-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX11-NEXT:   [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+; GFX11-NEXT:   [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr2
+; GFX11-NEXT:   [[TRUNC32:%[0-9]+]]:_(s1) = G_TRUNC [[COPY32]](s32)
+; GFX11-NEXT:   [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr3
+; GFX11-NEXT:   [[TRUNC33:%[0-9]+]]:_(s1) = G_TRUNC [[COPY33]](s32)
+; GFX11-NEXT:   [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr4
+; GFX11-NEXT:   [[TRUNC34:%[0-9]+]]:_(s1) = G_TRUNC [[COPY34]](s32)
+; GFX11-NEXT:   [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr5
+; GFX11-NEXT:   [[TRUNC35:%[0-9]+]]:_(s1) = G_TRUNC [[COPY35]](s32)
+; GFX11-NEXT:   [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr6
+; GFX11-NEXT:   [[TRUNC36:%[0-9]+]]:_(s1) = G_TRUNC [[COPY36]](s32)
+; GFX11-NEXT:   [[COPY37:%[0-9]+]]:_(s32) = COPY $vgpr7
+; GFX11-NEXT:   [[TRUNC37:%[0-9]+]]:_(s1) = G_TRUNC [[COPY37]](s32)
+; GFX11-NEXT:   [[COPY38:%[0-9]+]]:_(s32) = COPY $vgpr8
+; GFX11-NEXT:   [[TRUNC38:%[0-9]+]]:_(s1) = G_TRUNC [[COPY38]](s32)
+; GFX11-NEXT:   [[COPY39:%[0-9]+]]:_(s32) = COPY $vgpr9
+; GFX11-NEXT:   [[TRUNC39:%[0-9]+]]:_(s1) = G_TRUNC [[COPY39]](s32)
+; GFX11-NEXT:   [[COPY40:%[0-9]+]]:_(s32) = COPY $vgpr10
+; GFX11-NEXT:   [[TRUNC40:%[0-9]+]]:_(s1) = G_TRUNC [[COPY40]](s32)
+; GFX11-NEXT:   [[COPY41:%[0-9]+]]:_(s32) = COPY $vgpr11
+; GFX11-NEXT:   [[TRUNC41:%[0-9]+]]:_(s1) = G_TRUNC [[COPY41]](s32)
+; GFX11-NEXT:   [[COPY42:%[0-9]+]]:_(s32) = COPY $vgpr12
+; GFX11-NEXT:   [[TRUNC42:%[0-9]+]]:_(s1) = G_TRUNC [[COPY42]](s32)
+; GFX11-NEXT:   [[COPY43:%[0-9]+]]:_(s32) = COPY $vgpr13
+; GFX11-NEXT:   [[TRUNC43:%[0-9]+]]:_(s1) = G_TRUNC [[COPY43]](s32)
+; GFX11-NEXT:   [[COPY44:%[0-9]+]]:_(s32) = COPY $vgpr14
+; GFX11-NEXT:   [[TRUNC44:%[0-9]+]]:_(s1) = G_TRUNC [[COPY44]](s32)
+; GFX11-NEXT:   [[COPY45:%[0-9]+]]:_(s32) = COPY $vgpr15
+; GFX11-NEXT:   [[TRUNC45:%[0-9]+]]:_(s1) = G_TRUNC [[COPY45]](s32)
+; GFX11-NEXT:   [[COPY46:%[0-9]+]]:_(s32) = COPY $vgpr16
+; GFX11-NEXT:   [[TRUNC46:%[0-9]+]]:_(s1) = G_TRUNC [[COPY46]](s32)
+; GFX11-NEXT:   [[COPY47:%[0-9]+]]:_(s32) = COPY $vgpr17
+; GFX11-NEXT:   [[TRUNC47:%[0-9]+]]:_(s1) = G_TRUNC [[COPY47]](s32)
+; GFX11-NEXT:   [[COPY48:%[0-9]+]]:_(s32) = COPY $vgpr18
+; GFX11-NEXT:   [[TRUNC48:%[0-9]+]]:_(s1) = G_TRUNC [[COPY48]](s32)
+; GFX11-NEXT:   [[COPY49:%[0-9]+]]:_(s32) = COPY $vgpr19
+; GFX11-NEXT:   [[TRUNC49:%[0-9]+]]:_(s1) = G_TRUNC [[COPY49]](s32)
+; GFX11-NEXT:   [[COPY50:%[0-9]+]]:_(s32) = COPY $vgpr20
+; GFX11-NEXT:   [[TRUNC50:%[0-9]+]]:_(s1) = G_TRUNC [[COPY50]](s32)
+; GFX11-NEXT:   [[COPY51:%[0-9]+]]:_(s32) = COPY $vgpr21
+; GFX11-NEXT:   [[TRUNC51:%[0-9]+]]:_(s1) = G_TRUNC [[COPY51]](s32)
+; GFX11-NEXT:   [[COPY52:%[0-9]+]]:_(s32) = COPY $vgpr22
+; GFX11-NEXT:   [[TRUNC52:%[0-9]+]]:_(s1) = G_TRUNC [[COPY52]](s32)
+; GFX11-NEXT:   [[COPY53:%[0-9]+]]:_(s32) = COPY $vgpr23
+; GFX11-NEXT:   [[TRUNC53:%[0-9]+]]:_(s1) = G_TRUNC [[COPY53]](s32)
+; GFX11-NEXT:   [[COPY54:%[0-9]+]]:_(s32) = COPY $vgpr24
+; GFX11-NEXT:   [[TRUNC54:%[0-9]+]]:_(s1) = G_TRUNC [[COPY54]](s32)
+; GFX11-NEXT:   [[COPY55:%[0-9]+]]:_(s32) = COPY $vgpr25
+; GFX11-NEXT:   [[TRUNC55:%[0-9]+]]:_(s1) = G_TRUNC [[COPY55]](s32)
+; GFX11-NEXT:   [[COPY56:%[0-9]+]]:_(s32) = COPY $vgpr26
+; GFX11-NEXT:   [[TRUNC56:%[0-9]+]]:_(s1) = G_TRUNC [[COPY56]](s32)
+; GFX11-NEXT:   [[COPY57:%[0-9]+]]:_(s32) = COPY $vgpr27
+; GFX11-NEXT:   [[TRUNC57:%[0-9]+]]:_(s1) = G_TRUNC [[COPY57]](s32)
+; GFX11-NEXT:   [[COPY58:%[0-9]+]]:_(s32) = COPY $vgpr28
+; GFX11-NEXT:   [[TRUNC58:%[0-9]+]]:_(s1) = G_TRUNC [[COPY58]](s32)
+; GFX11-NEXT:   [[COPY59:%[0-9]+]]:_(s32) = COPY $vgpr29
+; GFX11-NEXT:   [[TRUNC59:%[0-9]+]]:_(s1) = G_TRUNC [[COPY59]](s32)
+; GFX11-NEXT:   [[COPY60:%[0-9]+]]:_(s32) = COPY $vgpr30
+; GFX11-NEXT:   [[TRUNC60:%[0-9]+]]:_(s1) = G_TRUNC [[COPY60]](s32)
+
+; GFX11-NEXT:   [[FRAME0:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
+; GFX11-NEXT:   [[LOAD0:%[0-9]+]]:_(s32) = G_LOAD [[FRAME0]](p5) :: (invariant load (s1) from %fixed-stack.2, align 16, addrspace 5)
+; GFX11-NEXT:   [[TRUNC61:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD0]](s32)
+; GFX11-NEXT:   [[FRAME1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
+; GFX11-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME1]](p5) :: (invariant load (s1) from %fixed-stack.1, align 4, addrspace 5)
+; GFX11-NEXT:   [[TRUNC62:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD1]](s32)
+
+; GFX11-NEXT:   [[FRAME2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
+; GFX11-NEXT:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME2]](p5) :: (invariant load (s1) from %fixed-stack.0, align 8, addrspace 5)
+; GFX11-NEXT:   [[TRUNC63:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD2]](s32)
+; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+;
+; GFX11-NEXT:   G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:   [[CONST1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; GFX11-NEXT:   [[PTRADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST1]]
+; GFX11-NEXT:   G_STORE [[COPY1]](s1), [[PTRADD1]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+;
+; GFX11:        [[CONST63:%[0-9]+]]:_(s64) = G_CONSTANT i64 63
+; GFX11-NEXT:   [[PTRADD63:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST63]]
+; GFX11-NEXT:   G_STORE [[TRUNC63]](s1), [[PTRADD63]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 63, addrspace 1)
+
+  store [64 x i1] %arg0, ptr addrspace(1) undef
+  ret void
+}
+
 define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
 ; GFX9-LABEL: name: void_func_i1_i1_inreg
 ; GFX9: bb.1 (%ir-block.0):
@@ -475,6 +706,41 @@ define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
   ret void
 }
 
+define void @test_call_void_func_i1_i1_inreg() {
+; GFX9-LABEL: name: test_call_void_func_i1_i1_inreg
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_i1_inreg]]
+; GFX9-NEXT:    $sgpr4_sgpr5 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
+; GFX9-NEXT:    $sgpr6 = COPY [[ANYEXT]](s32)
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_i1_i1_inreg
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
+; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_i1_inreg]]
+; GFX11-NEXT:    $sgpr0 = COPY [[LOAD]](s1)
+; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
+; GFX11-NEXT:    $sgpr1 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0, implicit $sgpr1
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    SI_RETURN
+  %val = load i1, ptr addrspace(1) undef
+  call void @void_func_i1_i1_inreg(i1 %val, i1 inreg true)
+  ret void
+}
+
 define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
 ; GFX9-LABEL: name: void_func_i1_inreg_i1
 ; GFX9: bb.1 (%ir-block.0):
@@ -504,3 +770,225 @@ define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
   ret void
 }
 
+define void @test_call_void_func_i1_inreg_i1() {
+; GFX9-LABEL: name: test_call_void_func_i1_inreg_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_inreg_i1]]
+; GFX9-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX9-NEXT:    $sgpr4 = COPY [[ANYEXT]](s32)
+; GFX9-NEXT:    $sgpr6_sgpr7 = COPY [[CONST]](s1)
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr4, implicit $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_i1_inreg_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
+; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_inreg_i1]]
+; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT:    $sgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT:    $sgpr1 = COPY [[CONST]](s1)
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0, implicit $sgpr1
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    SI_RETURN
+
+  %val = load i1, ptr addrspace(1) undef
+  call void @void_func_i1_inreg_i1(i1 inreg %val, i1 true)
+  ret void
+}
+
+define void @void_func_zeroext_i1_i1_inreg(i1 zeroext %arg0, i1 inreg %arg1) {
+; GFX9-LABEL: name: void_func_zeroext_i1_i1_inreg
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    liveins: $sgpr6, $sgpr4_sgpr5
+; GFX9-NEXT: {{  $}}
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX9-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: void_func_zeroext_i1_i1_inreg
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    liveins: $sgpr0, $sgpr1
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX11-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  ret void
+}
+
+define void @test_call_void_func_zeroext_i1_i1_inreg() {
+; GFX9-LABEL: name: test_call_void_func_zeroext_i1_i1_inreg
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_zeroext_i1_i1_inreg]]
+; GFX9-NEXT:    $sgpr4_sgpr5 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
+; GFX9-NEXT:    $sgpr6 = COPY [[ANYEXT]](s32)
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_zeroext_i1_i1_inreg
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
+; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_zeroext_i1_i1_inreg]]
+; GFX11-NEXT:    $sgpr0 = COPY [[LOAD]](s1)
+; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
+; GFX11-NEXT:    $sgpr1 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0, implicit $sgpr1
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    SI_RETURN
+  %val = load i1, ptr addrspace(1) undef
+  call void @void_func_zeroext_i1_i1_inreg(i1 zeroext %val, i1 inreg true)
+  ret void
+}
+
+define void @void_func_i1_inreg_zeroext_i1(i1 inreg %arg0, i1 zeroext %arg1) {
+; GFX9-LABEL: name: void_func_i1_inreg_zeroext_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    liveins: $sgpr4, $sgpr6_sgpr7
+; GFX9-NEXT: {{  $}}
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: void_func_i1_inreg_zeroext_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    liveins: $sgpr0, $sgpr1
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  ret void
+}
+
+define void @test_call_void_func_i1_inreg_zeroext_i1() {
+; GFX9-LABEL: name: test_call_void_func_i1_inreg_zeroext_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_inreg_zeroext_i1
+; GFX9-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX9-NEXT:    $sgpr4 = COPY [[ANYEXT]](s32)
+; GFX9-NEXT:    $sgpr6_sgpr7 = COPY [[CONST]](s1)
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_inreg_zeroext_i1, csr_amdgpu, implicit $sgpr4, implicit $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_i1_inreg_zeroext_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
+; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_inreg_zeroext_i1]]
+; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT:    $sgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT:    $sgpr1 = COPY [[CONST]](s1)
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0, implicit $sgpr1
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    SI_RETURN
+
+  %val = load i1, ptr addrspace(1) undef
+  call void @void_func_i1_inreg_zeroext_i1(i1 inreg %val, i1 zeroext true)
+  ret void
+}
+
+define void @void_func_signext_i1_i1_inreg(i1 signext %arg0, i1 inreg %arg1) {
+; GFX9-LABEL: name: void_func_signext_i1_i1_inreg
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    liveins: $sgpr6, $sgpr4_sgpr5
+; GFX9-NEXT: {{  $}}
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX9-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: void_func_signext_i1_i1_inreg
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    liveins: $sgpr0, $sgpr1
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX11-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_i1_inreg_signext_i1(i1 inreg %arg0, i1 signext %arg1) {
+; GFX9-LABEL: name: void_func_i1_inreg_signext_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    liveins: $sgpr4, $sgpr6_sgpr7
+; GFX9-NEXT: {{  $}}
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: void_func_i1_inreg_signext_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    liveins: $sgpr0, $sgpr1
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
index 252afe1712464..0fa5418962763 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
@@ -8,7 +8,7 @@ define i1 @i1_func_void() #0 {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[LOAD]](s1)
   ; CHECK-NEXT:   SI_RETURN
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
@@ -19,7 +19,7 @@ define zeroext i1 @i1_zeroext_func_void() #0 {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[LOAD]](s1)
   ; CHECK-NEXT:   SI_RETURN
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
@@ -30,7 +30,7 @@ define signext i1 @i1_signext_func_void() #0 {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[LOAD]](s1)
   ; CHECK-NEXT:   SI_RETURN
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index d0a17bc48c185..6bec8ac074239 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -5306,7 +5306,7 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[ANYEXT]](s32)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5318,7 +5318,7 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_i16_inreg(i16 inreg %arg)
@@ -5351,7 +5351,7 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[COPY9]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[COPY9]](s32)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5363,7 +5363,7 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_i32_inreg(i32 inreg %arg)
@@ -5399,8 +5399,8 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[UV1]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[UV]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[UV1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5412,7 +5412,7 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i64_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i64_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_i64_inreg(i64 inreg %arg)
@@ -5448,8 +5448,8 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[UV1]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[UV]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[UV1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5461,7 +5461,7 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i32_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i32_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v2i32_inreg(<2 x i32> inreg %arg)
@@ -5496,7 +5496,7 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[ANYEXT]](s32)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5508,7 +5508,7 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_f16_inreg(half inreg %arg)
@@ -5543,7 +5543,7 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[ANYEXT]](s32)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5555,7 +5555,7 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_bf16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_bf16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_bf16_inreg(bfloat inreg %arg)
@@ -5588,7 +5588,7 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[COPY9]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[COPY9]](s32)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5600,7 +5600,7 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f32_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f32_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_f32_inreg(float inreg %arg)
@@ -5636,8 +5636,8 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[UV1]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[UV]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[UV1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5649,7 +5649,7 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f64_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f64_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_f64_inreg(double inreg %arg)
@@ -5682,7 +5682,7 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
   ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[COPY9]](<2 x s16>)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[COPY9]](<2 x s16>)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5694,7 +5694,7 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v2f16_inreg(<2 x half> inreg %arg)
@@ -5735,8 +5735,8 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[DEF]](s16)
   ; CHECK-NEXT:   [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s16>)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[UV7]](<2 x s16>)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[UV8]](<2 x s16>)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[UV7]](<2 x s16>)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[UV8]](<2 x s16>)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5748,7 +5748,7 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v3f16_inreg(<3 x half> inreg %arg)
@@ -5784,8 +5784,8 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[UV]](<2 x s16>)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[UV1]](<2 x s16>)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[UV]](<2 x s16>)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[UV1]](<2 x s16>)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5797,7 +5797,7 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v4f16_inreg(<4 x half> inreg %arg)
@@ -5833,8 +5833,8 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](p0)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[UV1]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[UV]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[UV1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5846,7 +5846,7 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p0_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p0_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_p0_inreg(ptr inreg %arg)
@@ -5882,8 +5882,8 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](p1)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[UV1]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[UV]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[UV1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5895,7 +5895,7 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p1_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p1_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
@@ -5928,7 +5928,7 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
   ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[COPY9]](p3)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[COPY9]](p3)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5940,7 +5940,7 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p3_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p3_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
@@ -5980,10 +5980,10 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x p1>)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $sgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $sgpr3 = COPY [[UV3]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[UV]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[UV1]](s32)
+  ; CHECK-NEXT:   $sgpr18 = COPY [[UV2]](s32)
+  ; CHECK-NEXT:   $sgpr19 = COPY [[UV3]](s32)
   ; CHECK-NEXT:   [[COPY22:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY22]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY13]](p4)
@@ -5995,7 +5995,7 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY19]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY20]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY21]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p1_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p1_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg %arg)
@@ -6031,8 +6031,8 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x p5>)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[UV1]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[UV]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[UV1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -6044,7 +6044,7 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p5_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p5_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg %arg)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index a8a6f1954edd1..b8758a72998e2 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -26343,31 +26343,31 @@ define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) {
 ; GCN-LABEL: v_fcmp_false_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b64 s[0:1], 0
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_false_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_false_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b64 s[0:1], 0
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_false_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_false_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_false_bf16:
@@ -26387,7 +26387,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_eq_f32_e64 s[0:1], v0, v1
+; GCN-NEXT:    v_cmp_eq_f32_e64 s[4:5], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_oeq_bf16:
@@ -26397,7 +26397,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_eq_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT:    v_cmp_eq_f32_e64 s[4:5], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_oeq_bf16:
@@ -26405,7 +26405,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_eq_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT:    v_cmp_eq_f32_e64 s[4:5], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_oeq_bf16:
@@ -26413,7 +26413,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_eq_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT:    v_cmp_eq_f32_e64 s[4:5], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_oeq_bf16:
@@ -26421,7 +26421,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s0, v0, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_oeq_bf16:
@@ -26444,7 +26444,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_gt_f32_e64 s[0:1], v0, v1
+; GCN-NEXT:    v_cmp_gt_f32_e64 s[4:5], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ogt_bf16:
@@ -26454,7 +26454,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_gt_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT:    v_cmp_gt_f32_e64 s[4:5], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ogt_bf16:
@@ -26462,7 +26462,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_gt_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT:    v_cmp_gt_f32_e64 s[4:5], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ogt_bf16:
@@ -26470,7 +26470,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_gt_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT:    v_cmp_gt_f32_e64 s[4:5], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ogt_bf16:
@@ -26478,7 +26478,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s0, v0, v1
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ogt_bf16:
@@ -26501,7 +26501,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], v0, v1
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_oge_bf16:
@@ -26511,7 +26511,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_ge_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT:    v_cmp_ge_f32_e64 s[4:5], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_oge_bf16:
@@ -26519,7 +26519,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[4:5], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_oge_bf16:
@@ -26527,7 +26527,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_oge_bf16:
@@ -26535,7 +26535,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s0, v0, v1
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_oge_bf16:
@@ -26558,7 +26558,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_lt_f32_e64 s[0:1], v0, v1
+; GCN-NEXT:    v_cmp_lt_f32_e64 s[4:5], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_olt_bf16:
@@ -26568,7 +26568,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_lt_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT:    v_cmp_lt_f32_e64 s[4:5], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_olt_bf16:
@@ -26576,7 +26576,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_lt_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT:    v_cmp_lt_f32_e64 s[4:5], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_olt_bf16:
@@ -26584,7 +26584,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_lt_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_olt_bf16:
@@ -26592,7 +26592,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s0, v0, v1
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_olt_bf16:
@@ -26615,7 +26615,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_le_f32_e64 s[0:1], v0, v1
+; GCN-NEXT:    v_cmp_le_f32_e64 s[4:5], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ole_bf16:
@@ -26625,7 +26625,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_le_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT:    v_cmp_le_f32_e64 s[4:5], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ole_bf16:
@@ -26633,7 +26633,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_le_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT:    v_cmp_le_f32_e64 s[4:5], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ole_bf16:
@@ -26641,7 +26641,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_le_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT:    v_cmp_le_f32_e64 s[4:5], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ole_bf16:
@@ -26649,7 +26649,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_le_f32_e64 s0, v0, v1
+; GFX10-NEXT:    v_cmp_le_f32_e64 s4, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ole_bf16:
@@ -26672,7 +26672,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_lg_f32_e64 s[0:1], v0, v1
+; GCN-NEXT:    v_cmp_lg_f32_e64 s[4:5], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_one_bf16:
@@ -26682,7 +26682,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_lg_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT:    v_cmp_lg_f32_e64 s[4:5], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_one_bf16:
@@ -26690,7 +26690,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_lg_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT:    v_cmp_lg_f32_e64 s[4:5], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_one_bf16:
@@ -26698,7 +26698,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_lg_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT:    v_cmp_lg_f32_e64 s[4:5], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_one_bf16:
@@ -26706,7 +26706,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_lg_f32_e64 s0, v0, v1
+; GFX10-NEXT:    v_cmp_lg_f32_e64 s4, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_one_bf16:
@@ -26729,7 +26729,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v1
+; GCN-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_uno_bf16:
@@ -26739,7 +26739,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_uno_bf16:
@@ -26747,7 +26747,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_uno_bf16:
@@ -26755,7 +26755,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_uno_bf16:
@@ -26763,7 +26763,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_u_f32_e64 s0, v0, v1
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_uno_bf16:
@@ -26786,7 +26786,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_nlg_f32_e64 s[0:1], v0, v1
+; GCN-NEXT:    v_cmp_nlg_f32_e64 s[4:5], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ueq_bf16:
@@ -26796,7 +26796,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_nlg_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT:    v_cmp_nlg_f32_e64 s[4:5], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ueq_bf16:
@@ -26804,7 +26804,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_nlg_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT:    v_cmp_nlg_f32_e64 s[4:5], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ueq_bf16:
@@ -26812,7 +26812,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_nlg_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT:    v_cmp_nlg_f32_e64 s[4:5], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ueq_bf16:
@@ -26820,7 +26820,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_nlg_f32_e64 s0, v0, v1
+; GFX10-NEXT:    v_cmp_nlg_f32_e64 s4, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ueq_bf16:
@@ -26843,7 +26843,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_nle_f32_e64 s[0:1], v0, v1
+; GCN-NEXT:    v_cmp_nle_f32_e64 s[4:5], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ugt_bf16:
@@ -26853,7 +26853,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_nle_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT:    v_cmp_nle_f32_e64 s[4:5], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ugt_bf16:
@@ -26861,7 +26861,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_nle_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT:    v_cmp_nle_f32_e64 s[4:5], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ugt_bf16:
@@ -26869,7 +26869,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_nle_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT:    v_cmp_nle_f32_e64 s[4:5], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ugt_bf16:
@@ -26877,7 +26877,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_nle_f32_e64 s0, v0, v1
+; GFX10-NEXT:    v_cmp_nle_f32_e64 s4, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ugt_bf16:
@@ -26900,7 +26900,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_nlt_f32_e64 s[0:1], v0, v1
+; GCN-NEXT:    v_cmp_nlt_f32_e64 s[4:5], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_uge_bf16:
@@ -26910,7 +26910,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_nlt_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT:    v_cmp_nlt_f32_e64 s[4:5], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_uge_bf16:
@@ -26918,7 +26918,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_nlt_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT:    v_cmp_nlt_f32_e64 s[4:5], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_uge_bf16:
@@ -26926,7 +26926,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_nlt_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT:    v_cmp_nlt_f32_e64 s[4:5], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_uge_bf16:
@@ -26934,7 +26934,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_nlt_f32_e64 s0, v0, v1
+; GFX10-NEXT:    v_cmp_nlt_f32_e64 s4, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_uge_bf16:
@@ -26957,7 +26957,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_nge_f32_e64 s[0:1], v0, v1
+; GCN-NEXT:    v_cmp_nge_f32_e64 s[4:5], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ult_bf16:
@@ -26967,7 +26967,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_nge_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT:    v_cmp_nge_f32_e64 s[4:5], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ult_bf16:
@@ -26975,7 +26975,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_nge_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT:    v_cmp_nge_f32_e64 s[4:5], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ult_bf16:
@@ -26983,7 +26983,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_nge_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT:    v_cmp_nge_f32_e64 s[4:5], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ult_bf16:
@@ -26991,7 +26991,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_nge_f32_e64 s0, v0, v1
+; GFX10-NEXT:    v_cmp_nge_f32_e64 s4, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ult_bf16:
@@ -27014,7 +27014,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_ngt_f32_e64 s[0:1], v0, v1
+; GCN-NEXT:    v_cmp_ngt_f32_e64 s[4:5], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ule_bf16:
@@ -27024,7 +27024,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_ngt_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT:    v_cmp_ngt_f32_e64 s[4:5], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ule_bf16:
@@ -27032,7 +27032,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_ngt_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT:    v_cmp_ngt_f32_e64 s[4:5], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ule_bf16:
@@ -27040,7 +27040,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_ngt_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT:    v_cmp_ngt_f32_e64 s[4:5], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ule_bf16:
@@ -27048,7 +27048,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_ngt_f32_e64 s0, v0, v1
+; GFX10-NEXT:    v_cmp_ngt_f32_e64 s4, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ule_bf16:
@@ -27071,7 +27071,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_neq_f32_e64 s[0:1], v0, v1
+; GCN-NEXT:    v_cmp_neq_f32_e64 s[4:5], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_une_bf16:
@@ -27081,7 +27081,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_neq_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT:    v_cmp_neq_f32_e64 s[4:5], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_une_bf16:
@@ -27089,7 +27089,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_neq_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT:    v_cmp_neq_f32_e64 s[4:5], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_une_bf16:
@@ -27097,7 +27097,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_neq_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT:    v_cmp_neq_f32_e64 s[4:5], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_une_bf16:
@@ -27105,7 +27105,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_neq_f32_e64 s0, v0, v1
+; GFX10-NEXT:    v_cmp_neq_f32_e64 s4, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_une_bf16:
@@ -27124,31 +27124,31 @@ define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) {
 ; GCN-LABEL: v_fcmp_true_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b64 s[0:1], -1
+; GCN-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_true_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[0:1], -1
+; GFX7-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_true_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b64 s[0:1], -1
+; GFX8-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_true_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b64 s[0:1], -1
+; GFX9-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_true_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s0, -1
+; GFX10-NEXT:    s_mov_b32 s4, -1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_true_bf16:
@@ -33476,6 +33476,8 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GCN-LABEL: v_select_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -33483,6 +33485,8 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX7-LABEL: v_select_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -33518,7 +33522,8 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GCN-LABEL: v_select_fneg_lhs_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -33526,7 +33531,8 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX7-LABEL: v_select_fneg_lhs_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, -1.0, v0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -33568,7 +33574,8 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GCN-LABEL: v_select_fneg_rhs_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, -1.0, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -33576,7 +33583,8 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX7-LABEL: v_select_fneg_rhs_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, -1.0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -33618,6 +33626,10 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
 ; GCN-LABEL: v_select_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -33631,6 +33643,10 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
 ; GFX7-LABEL: v_select_v2bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -34062,6 +34078,12 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
 ; GCN-LABEL: v_select_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -34078,9 +34100,15 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
 ; GFX7-LABEL: v_select_v3bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v4
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT:    v_alignbit_b32 v1, v1, v3, 16
@@ -34126,6 +34154,14 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
 ; GCN-LABEL: v_select_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -34145,14 +34181,22 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
 ; GFX7-LABEL: v_select_v4bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v7
 ; GFX7-NEXT:    v_alignbit_b32 v1, v1, v4, 16
-; GFX7-NEXT:    v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v6
+; GFX7-NEXT:    v_alignbit_b32 v3, v3, v4, 16
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -34196,6 +34240,18 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
 ; GCN-LABEL: v_select_v6bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -34222,18 +34278,30 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
 ; GFX7-LABEL: v_select_v6bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
-; GFX7-NEXT:    v_alignbit_b32 v4, v5, v4, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v11
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX7-NEXT:    v_alignbit_b32 v1, v1, v6, 16
-; GFX7-NEXT:    v_alignbit_b32 v3, v3, v8, 16
-; GFX7-NEXT:    v_alignbit_b32 v5, v5, v10, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v8
+; GFX7-NEXT:    v_alignbit_b32 v4, v5, v4, 16
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v11
+; GFX7-NEXT:    v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v10
+; GFX7-NEXT:    v_alignbit_b32 v5, v5, v6, 16
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
@@ -34284,6 +34352,22 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
 ; GCN-LABEL: v_select_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -34317,22 +34401,38 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
 ; GFX7-LABEL: v_select_v8bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v9
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v11
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v11
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT:    v_alignbit_b32 v1, v1, v8, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v10
 ; GFX7-NEXT:    v_alignbit_b32 v4, v5, v4, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v13
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v13
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT:    v_alignbit_b32 v3, v3, v8, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v12
 ; GFX7-NEXT:    v_alignbit_b32 v6, v7, v6, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
-; GFX7-NEXT:    v_alignbit_b32 v1, v1, v8, 16
-; GFX7-NEXT:    v_alignbit_b32 v3, v3, v10, 16
-; GFX7-NEXT:    v_alignbit_b32 v5, v5, v12, 16
-; GFX7-NEXT:    v_alignbit_b32 v7, v7, v14, 16
+; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v15
+; GFX7-NEXT:    v_alignbit_b32 v5, v5, v8, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v14
+; GFX7-NEXT:    v_alignbit_b32 v7, v7, v8, 16
 ; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
@@ -34390,44 +34490,77 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GCN-LABEL: v_select_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v17
+; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_alignbit_b32 v1, v1, v16, 16
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GCN-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
-; GCN-NEXT:    v_alignbit_b32 v3, v3, v18, 16
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v19
+; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v18
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_alignbit_b32 v3, v3, v16, 16
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; GCN-NEXT:    v_alignbit_b32 v4, v5, v4, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v21
-; GCN-NEXT:    v_alignbit_b32 v5, v5, v20, 16
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v21
+; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v20
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_alignbit_b32 v5, v5, v16, 16
+; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GCN-NEXT:    v_alignbit_b32 v6, v7, v6, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
-; GCN-NEXT:    v_alignbit_b32 v7, v7, v22, 16
+; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v23
+; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v22
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_alignbit_b32 v7, v7, v16, 16
+; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
 ; GCN-NEXT:    v_alignbit_b32 v8, v9, v8, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v25
-; GCN-NEXT:    v_alignbit_b32 v9, v9, v24, 16
+; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v25
+; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v24
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    v_alignbit_b32 v9, v9, v16, 16
+; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v27
+; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v26
+; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v29
+; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v28
+; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
 ; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v27
+; GCN-NEXT:    v_alignbit_b32 v10, v11, v10, 16
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s32
+; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v30
+; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
 ; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v29
+; GCN-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
 ; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT:    v_alignbit_b32 v10, v11, v10, 16
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32
-; GCN-NEXT:    v_alignbit_b32 v11, v16, v26, 16
+; GCN-NEXT:    v_alignbit_b32 v16, v16, v17, 16
 ; GCN-NEXT:    v_alignbit_b32 v12, v13, v12, 16
-; GCN-NEXT:    v_alignbit_b32 v13, v17, v28, 16
+; GCN-NEXT:    v_alignbit_b32 v13, v18, v19, 16
 ; GCN-NEXT:    v_alignbit_b32 v14, v15, v14, 16
 ; GCN-NEXT:    v_cndmask_b32_e64 v13, v13, v12, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v11, v11, v10, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v12, v16, v10, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v8, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v11
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -34438,13 +34571,12 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
 ; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v12
 ; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
 ; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
-; GCN-NEXT:    v_alignbit_b32 v15, v15, v30, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT:    v_alignbit_b32 v15, v15, v20, 16
 ; GCN-NEXT:    v_cndmask_b32_e64 v15, v15, v14, s[4:5]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
 ; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
@@ -34453,39 +34585,69 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GFX7-LABEL: v_select_v16bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX7-NEXT:    v_alignbit_b32 v10, v11, v10, 16
-; GFX7-NEXT:    buffer_load_dword v11, off, s[0:3], s32
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v17
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v19
 ; GFX7-NEXT:    v_alignbit_b32 v1, v1, v16, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v27
-; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 16, v29
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v18
+; GFX7-NEXT:    v_alignbit_b32 v3, v3, v16, 16
+; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX7-NEXT:    v_alignbit_b32 v16, v16, v26, 16
-; GFX7-NEXT:    v_alignbit_b32 v12, v13, v12, 16
-; GFX7-NEXT:    v_alignbit_b32 v13, v17, v28, 16
-; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GFX7-NEXT:    v_alignbit_b32 v4, v5, v4, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v21
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v21
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v20
 ; GFX7-NEXT:    v_alignbit_b32 v6, v7, v6, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
+; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v23
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT:    v_alignbit_b32 v5, v5, v17, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v22
 ; GFX7-NEXT:    v_alignbit_b32 v8, v9, v8, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v25
+; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v25
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT:    v_alignbit_b32 v7, v7, v17, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v24
+; GFX7-NEXT:    v_alignbit_b32 v10, v11, v10, 16
+; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v27
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX7-NEXT:    v_cndmask_b32_e64 v13, v13, v12, s[4:5]
-; GFX7-NEXT:    v_cndmask_b32_e64 v12, v16, v10, s[4:5]
-; GFX7-NEXT:    v_alignbit_b32 v3, v3, v18, 16
-; GFX7-NEXT:    v_alignbit_b32 v5, v5, v20, 16
-; GFX7-NEXT:    v_alignbit_b32 v7, v7, v22, 16
-; GFX7-NEXT:    v_alignbit_b32 v9, v9, v24, 16
+; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT:    v_alignbit_b32 v9, v9, v17, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v26
+; GFX7-NEXT:    v_alignbit_b32 v12, v13, v12, 16
+; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v29
 ; GFX7-NEXT:    v_alignbit_b32 v14, v15, v14, 16
+; GFX7-NEXT:    v_alignbit_b32 v11, v11, v17, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v28
+; GFX7-NEXT:    v_alignbit_b32 v13, v13, v17, 16
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v13, v12, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v10, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v9, v9, v8, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[4:5]
@@ -34501,14 +34663,16 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
 ; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v11
-; GFX7-NEXT:    v_alignbit_b32 v10, v10, v30, 16
-; GFX7-NEXT:    v_cndmask_b32_e64 v15, v10, v14, s[4:5]
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
-; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
 ; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v30
+; GFX7-NEXT:    v_alignbit_b32 v15, v15, v16, 16
+; GFX7-NEXT:    v_cndmask_b32_e64 v15, v15, v14, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
 ; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -34572,136 +34736,200 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GCN-LABEL: v_select_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v3
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_alignbit_b32 v1, v1, v2, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GCN-NEXT:    v_alignbit_b32 v2, v2, v4, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
-; GCN-NEXT:    v_alignbit_b32 v3, v3, v6, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v9
-; GCN-NEXT:    v_alignbit_b32 v4, v4, v8, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v11
-; GCN-NEXT:    v_alignbit_b32 v5, v5, v10, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v13
-; GCN-NEXT:    v_alignbit_b32 v6, v6, v12, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
-; GCN-NEXT:    v_alignbit_b32 v7, v7, v14, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v17
-; GCN-NEXT:    v_alignbit_b32 v8, v8, v16, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
-; GCN-NEXT:    v_alignbit_b32 v9, v9, v18, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v21
-; GCN-NEXT:    v_alignbit_b32 v10, v10, v20, 16
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v5
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT:    v_alignbit_b32 v2, v2, v3, 16
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v7
+; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v9
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v8
+; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT:    v_alignbit_b32 v4, v4, v5, 16
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v11
+; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v10
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_alignbit_b32 v5, v5, v6, 16
+; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v13
+; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v12
+; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT:    v_alignbit_b32 v6, v6, v7, 16
+; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v15
+; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v14
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_alignbit_b32 v7, v7, v8, 16
+; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v17
+; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v16
+; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT:    v_alignbit_b32 v8, v8, v9, 16
+; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v19
+; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v18
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    v_alignbit_b32 v9, v9, v10, 16
+; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v21
+; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v20
+; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT:    v_alignbit_b32 v10, v10, v11, 16
 ; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:8
-; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v23
-; GCN-NEXT:    v_alignbit_b32 v11, v11, v22, 16
+; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v23
+; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v22
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT:    v_alignbit_b32 v11, v11, v12, 16
 ; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
-; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v25
-; GCN-NEXT:    v_alignbit_b32 v12, v12, v24, 16
+; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v25
+; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v24
+; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT:    v_alignbit_b32 v12, v12, v13, 16
 ; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:16
-; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v27
-; GCN-NEXT:    v_alignbit_b32 v13, v13, v26, 16
+; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v27
+; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v26
+; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT:    v_alignbit_b32 v13, v13, v14, 16
 ; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
-; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v29
-; GCN-NEXT:    v_alignbit_b32 v14, v14, v28, 16
+; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v29
+; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v28
+; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT:    v_alignbit_b32 v14, v14, v19, 16
 ; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:24
 ; GCN-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
 ; GCN-NEXT:    v_alignbit_b32 v15, v15, v16, 16
 ; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v17
+; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v17
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_alignbit_b32 v16, v16, v18, 16
+; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v18
+; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT:    v_alignbit_b32 v16, v16, v17, 16
 ; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:32
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v19
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v20
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:28
+; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT:    v_alignbit_b32 v17, v17, v19, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_alignbit_b32 v17, v17, v20, 16
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:28
+; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v20
 ; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:40
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
 ; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:36
-; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
 ; GCN-NEXT:    v_alignbit_b32 v18, v18, v19, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:48
+; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v20
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v21
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:48
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:44
+; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT:    v_alignbit_b32 v19, v19, v20, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_alignbit_b32 v19, v19, v21, 16
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:44
+; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v21
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v22
 ; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
 ; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:52
-; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
 ; GCN-NEXT:    v_alignbit_b32 v20, v20, v21, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshrrev_b32_e32 v21, 16, v22
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:64
+; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v22
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v23
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:64
+; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:60
+; GCN-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT:    v_alignbit_b32 v21, v21, v22, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_alignbit_b32 v21, v21, v23, 16
-; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:60
+; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v23
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v24
 ; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:72
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
 ; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:68
-; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
 ; GCN-NEXT:    v_alignbit_b32 v22, v22, v23, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshrrev_b32_e32 v23, 16, v24
-; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:80
+; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v24
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v25
+; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:80
+; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:76
+; GCN-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT:    v_alignbit_b32 v23, v23, v24, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_alignbit_b32 v23, v23, v25, 16
-; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:76
+; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v25
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v26
 ; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:88
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
 ; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:84
-; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
 ; GCN-NEXT:    v_alignbit_b32 v24, v24, v25, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshrrev_b32_e32 v25, 16, v26
-; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:96
+; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v26
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v27
+; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:96
+; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:92
+; GCN-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT:    v_alignbit_b32 v25, v25, v26, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_alignbit_b32 v25, v25, v27, 16
-; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:92
+; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v27
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v28
 ; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:104
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
 ; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:100
-; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
 ; GCN-NEXT:    v_alignbit_b32 v26, v26, v27, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshrrev_b32_e32 v27, 16, v28
-; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:112
+; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v28
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v29
+; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:112
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
+; GCN-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT:    v_alignbit_b32 v27, v27, v28, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_alignbit_b32 v27, v27, v29, 16
-; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:108
+; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v29
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v31
 ; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
-; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
 ; GCN-NEXT:    v_alignbit_b32 v28, v28, v29, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshrrev_b32_e32 v29, 16, v31
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_alignbit_b32 v29, v29, v32, 16
+; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v31
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT:    v_alignbit_b32 v29, v29, v31, 16
+; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v32
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124
-; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
 ; GCN-NEXT:    v_alignbit_b32 v30, v31, v30, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v32
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_alignbit_b32 v31, v31, v33, 16
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT:    v_alignbit_b32 v31, v31, v32, 16
 ; GCN-NEXT:    v_cndmask_b32_e64 v31, v31, v30, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v29, v29, v14, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v28, v28, v13, s[4:5]
@@ -34755,160 +34983,227 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX7-LABEL: v_select_v32bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_alignbit_b32 v1, v1, v2, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
-; GFX7-NEXT:    v_alignbit_b32 v2, v2, v4, 16
-; GFX7-NEXT:    v_alignbit_b32 v3, v3, v6, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v9
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v11
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v13
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
-; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:8
-; GFX7-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:12
-; GFX7-NEXT:    v_alignbit_b32 v5, v5, v10, 16
-; GFX7-NEXT:    v_alignbit_b32 v6, v6, v12, 16
-; GFX7-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
-; GFX7-NEXT:    v_alignbit_b32 v7, v7, v14, 16
-; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:36
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
-; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:4
-; GFX7-NEXT:    v_alignbit_b32 v4, v4, v8, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v17
-; GFX7-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
-; GFX7-NEXT:    v_alignbit_b32 v8, v8, v16, 16
-; GFX7-NEXT:    v_alignbit_b32 v24, v25, v24, 16
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX7-NEXT:    v_alignbit_b32 v2, v2, v3, 16
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v6
+; GFX7-NEXT:    v_alignbit_b32 v3, v3, v4, 16
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v8
+; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT:    v_alignbit_b32 v4, v4, v5, 16
+; GFX7-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:20
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT:    v_alignbit_b32 v16, v17, v16, 16
+; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:72
+; GFX7-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT:    v_alignbit_b32 v12, v13, v12, 16
+; GFX7-NEXT:    v_alignbit_b32 v26, v27, v26, 16
+; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT:    v_alignbit_b32 v10, v11, v10, 16
+; GFX7-NEXT:    v_alignbit_b32 v22, v23, v22, 16
+; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT:    v_alignbit_b32 v14, v15, v14, 16
+; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT:    v_alignbit_b32 v18, v19, v18, 16
+; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
 ; GFX7-NEXT:    v_alignbit_b32 v20, v21, v20, 16
+; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT:    v_alignbit_b32 v24, v25, v24, 16
+; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
 ; GFX7-NEXT:    v_alignbit_b32 v28, v29, v28, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; GFX7-NEXT:    v_alignbit_b32 v18, v19, v18, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
-; GFX7-NEXT:    v_alignbit_b32 v22, v23, v22, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
-; GFX7-NEXT:    v_alignbit_b32 v26, v27, v26, 16
-; GFX7-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:28
-; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
-; GFX7-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:104
-; GFX7-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:44
+; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56
+; GFX7-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:112
+; GFX7-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48
+; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:96
+; GFX7-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:64
+; GFX7-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:80
 ; GFX7-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:88
-; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:60
+; GFX7-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:104
 ; GFX7-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:120
-; GFX7-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:80
-; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:96
-; GFX7-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:112
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
 ; GFX7-NEXT:    s_waitcnt vmcnt(14)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(13)
+; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT:    s_waitcnt vmcnt(12)
+; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
 ; GFX7-NEXT:    s_waitcnt vmcnt(11)
-; GFX7-NEXT:    v_alignbit_b32 v9, v9, v10, 16
-; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:16
-; GFX7-NEXT:    s_waitcnt vmcnt(9)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
-; GFX7-NEXT:    s_waitcnt vmcnt(7)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT:    v_alignbit_b32 v5, v5, v6, 16
+; GFX7-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT:    s_waitcnt vmcnt(11)
+; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT:    s_waitcnt vmcnt(10)
+; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
 ; GFX7-NEXT:    s_waitcnt vmcnt(5)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
-; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
-; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX7-NEXT:    v_alignbit_b32 v10, v10, v11, 16
-; GFX7-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:24
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX7-NEXT:    v_alignbit_b32 v11, v11, v12, 16
-; GFX7-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:32
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX7-NEXT:    v_alignbit_b32 v12, v12, v13, 16
-; GFX7-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX7-NEXT:    v_alignbit_b32 v13, v13, v14, 16
-; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:48
-; GFX7-NEXT:    v_cndmask_b32_e64 v13, v13, v4, s[4:5]
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT:    v_alignbit_b32 v14, v14, v15, 16
-; GFX7-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:56
-; GFX7-NEXT:    v_cndmask_b32_e64 v14, v14, v5, s[4:5]
-; GFX7-NEXT:    v_cndmask_b32_e64 v5, v11, v2, s[4:5]
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v14
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT:    v_alignbit_b32 v6, v6, v7, 16
+; GFX7-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:24
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX7-NEXT:    v_alignbit_b32 v15, v15, v16, 16
-; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:64
-; GFX7-NEXT:    v_cndmask_b32_e64 v15, v15, v6, s[4:5]
+; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v7, v7, v8, 16
+; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:32
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v2, s[4:5]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX7-NEXT:    v_alignbit_b32 v16, v16, v17, 16
-; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:72
-; GFX7-NEXT:    v_cndmask_b32_e64 v16, v16, v7, s[4:5]
-; GFX7-NEXT:    v_cndmask_b32_e64 v7, v12, v3, s[4:5]
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v10, v1, s[4:5]
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, v9, v0, s[4:5]
+; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT:    v_alignbit_b32 v8, v8, v9, 16
+; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:40
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v8, v3, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v6, v1, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v5, v0, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v14
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v16
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT:    v_alignbit_b32 v9, v9, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v9, v4, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT:    v_alignbit_b32 v11, v11, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v10, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT:    v_alignbit_b32 v13, v13, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v13, v12, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT:    v_alignbit_b32 v15, v15, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX7-NEXT:    v_cndmask_b32_e64 v15, v15, v14, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
 ; GFX7-NEXT:    v_alignbit_b32 v17, v17, v31, 16
 ; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
-; GFX7-NEXT:    v_cndmask_b32_e64 v17, v17, v8, s[4:5]
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
-; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v15
-; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v16
+; GFX7-NEXT:    v_cndmask_b32_e64 v17, v17, v16, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
 ; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
 ; GFX7-NEXT:    v_alignbit_b32 v19, v19, v31, 16
 ; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
 ; GFX7-NEXT:    v_cndmask_b32_e64 v19, v19, v18, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
 ; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
 ; GFX7-NEXT:    v_alignbit_b32 v21, v21, v31, 16
 ; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
 ; GFX7-NEXT:    v_cndmask_b32_e64 v21, v21, v20, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
 ; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
 ; GFX7-NEXT:    v_alignbit_b32 v23, v23, v31, 16
 ; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
 ; GFX7-NEXT:    v_cndmask_b32_e64 v23, v23, v22, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
 ; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
 ; GFX7-NEXT:    v_alignbit_b32 v25, v25, v31, 16
 ; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
 ; GFX7-NEXT:    v_cndmask_b32_e64 v25, v25, v24, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
 ; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
 ; GFX7-NEXT:    v_alignbit_b32 v27, v27, v31, 16
 ; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
 ; GFX7-NEXT:    v_cndmask_b32_e64 v27, v27, v26, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
 ; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
 ; GFX7-NEXT:    v_alignbit_b32 v29, v29, v31, 16
 ; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX7-NEXT:    v_cndmask_b32_e64 v29, v29, v28, s[4:5]
@@ -34920,6 +35215,7 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX7-NEXT:    v_alignbit_b32 v30, v31, v30, 16
 ; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
 ; GFX7-NEXT:    v_alignbit_b32 v31, v31, v32, 16
 ; GFX7-NEXT:    v_cndmask_b32_e64 v31, v31, v30, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
index bcdfb75ab1ef9..0b9b37f85a755 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
@@ -193,24 +193,22 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) {
 ; DAGISEL-ASM-LABEL: recursive_phis:
 ; DAGISEL-ASM:       ; %bb.0: ; %entry
 ; DAGISEL-ASM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-ASM-NEXT:    v_and_b32_e32 v0, 1, v0
-; DAGISEL-ASM-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; DAGISEL-ASM-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
-; DAGISEL-ASM-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; DAGISEL-ASM-NEXT:    v_lshrrev_b32_e64 v1, 6, s32
+; DAGISEL-ASM-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; DAGISEL-ASM-NEXT:  ; %bb.1: ; %then
-; DAGISEL-ASM-NEXT:    v_and_b32_e32 v0, 0xffff, v1
+; DAGISEL-ASM-NEXT:    v_and_b32_e32 v1, 0xffff, v0
 ; DAGISEL-ASM-NEXT:  ; %bb.2: ; %finallyendcf.split
-; DAGISEL-ASM-NEXT:    s_or_b64 exec, exec, s[4:5]
-; DAGISEL-ASM-NEXT:    s_xor_b64 s[6:7], vcc, -1
+; DAGISEL-ASM-NEXT:    s_or_b64 exec, exec, s[6:7]
+; DAGISEL-ASM-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
 ; DAGISEL-ASM-NEXT:    s_mov_b64 s[4:5], 0
 ; DAGISEL-ASM-NEXT:    s_mov_b64 s[8:9], src_private_base
-; DAGISEL-ASM-NEXT:    v_mov_b32_e32 v2, 7
+; DAGISEL-ASM-NEXT:    v_mov_b32_e32 v0, 7
 ; DAGISEL-ASM-NEXT:  .LBB7_3: ; %finally
 ; DAGISEL-ASM-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; DAGISEL-ASM-NEXT:    s_and_b64 s[10:11], exec, s[6:7]
 ; DAGISEL-ASM-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
-; DAGISEL-ASM-NEXT:    v_mov_b32_e32 v1, s9
-; DAGISEL-ASM-NEXT:    flat_store_dword v[0:1], v2
+; DAGISEL-ASM-NEXT:    v_mov_b32_e32 v2, s9
+; DAGISEL-ASM-NEXT:    flat_store_dword v[1:2], v0
 ; DAGISEL-ASM-NEXT:    s_waitcnt vmcnt(0)
 ; DAGISEL-ASM-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; DAGISEL-ASM-NEXT:    s_cbranch_execnz .LBB7_3
@@ -222,29 +220,27 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) {
 ; GISEL-ASM-LABEL: recursive_phis:
 ; GISEL-ASM:       ; %bb.0: ; %entry
 ; GISEL-ASM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-ASM-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-ASM-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-ASM-NEXT:    s_xor_b64 s[4:5], vcc, -1
-; GISEL-ASM-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
-; GISEL-ASM-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-ASM-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GISEL-ASM-NEXT:    v_lshrrev_b32_e64 v1, 6, s32
+; GISEL-ASM-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
 ; GISEL-ASM-NEXT:  ; %bb.1: ; %then
-; GISEL-ASM-NEXT:    v_and_b32_e32 v0, 0xffff, v1
+; GISEL-ASM-NEXT:    v_and_b32_e32 v1, 0xffff, v0
 ; GISEL-ASM-NEXT:  ; %bb.2: ; %finallyendcf.split
-; GISEL-ASM-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-ASM-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GISEL-ASM-NEXT:    s_mov_b64 s[8:9], src_private_base
-; GISEL-ASM-NEXT:    s_mov_b64 s[6:7], 0
-; GISEL-ASM-NEXT:    v_mov_b32_e32 v1, s9
-; GISEL-ASM-NEXT:    v_mov_b32_e32 v2, 7
+; GISEL-ASM-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-ASM-NEXT:    v_mov_b32_e32 v2, s9
+; GISEL-ASM-NEXT:    v_mov_b32_e32 v0, 7
 ; GISEL-ASM-NEXT:  .LBB7_3: ; %finally
 ; GISEL-ASM-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GISEL-ASM-NEXT:    s_and_b64 s[8:9], exec, s[4:5]
-; GISEL-ASM-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
-; GISEL-ASM-NEXT:    flat_store_dword v[0:1], v2
+; GISEL-ASM-NEXT:    s_and_b64 s[8:9], exec, s[6:7]
+; GISEL-ASM-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; GISEL-ASM-NEXT:    flat_store_dword v[1:2], v0
 ; GISEL-ASM-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-ASM-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GISEL-ASM-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GISEL-ASM-NEXT:    s_cbranch_execnz .LBB7_3
 ; GISEL-ASM-NEXT:  ; %bb.4: ; %end
-; GISEL-ASM-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-ASM-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GISEL-ASM-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-ASM-NEXT:    s_setpc_b64 s[30:31]
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index 53448df79ee27..1931058d75a99 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -43,8 +43,8 @@ define i1 @divergent_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x, i1 %z) {
   ; GCN-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[V_BFE_I32_e64_]], killed [[S_MOV_B32_]], implicit $exec
   ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], [[COPY]], implicit-def dead $scc
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vreg_1 = COPY [[S_OR_B64_]]
-  ; GCN-NEXT:   $sgpr0_sgpr1 = COPY [[COPY2]]
-  ; GCN-NEXT:   SI_RETURN implicit $sgpr0_sgpr1
+  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY2]]
+  ; GCN-NEXT:   SI_RETURN implicit $sgpr4_sgpr5
   %setcc = icmp slt i16 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
   ret i1 %select
@@ -92,8 +92,8 @@ define i1 @divergent_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x, i1 %z) {
   ; GCN-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY1]], killed [[S_MOV_B32_]], implicit $exec
   ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], [[COPY]], implicit-def dead $scc
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vreg_1 = COPY [[S_OR_B64_]]
-  ; GCN-NEXT:   $sgpr0_sgpr1 = COPY [[COPY2]]
-  ; GCN-NEXT:   SI_RETURN implicit $sgpr0_sgpr1
+  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY2]]
+  ; GCN-NEXT:   SI_RETURN implicit $sgpr4_sgpr5
   %setcc = icmp slt i32 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
   ret i1 %select
@@ -148,8 +148,8 @@ define i1 @divergent_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x, i1 %z) {
   ; GCN-NEXT:   [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE]], [[COPY3]], implicit $exec
   ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], [[COPY]], implicit-def dead $scc
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vreg_1 = COPY [[S_OR_B64_]]
-  ; GCN-NEXT:   $sgpr0_sgpr1 = COPY [[COPY2]]
-  ; GCN-NEXT:   SI_RETURN implicit $sgpr0_sgpr1
+  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY2]]
+  ; GCN-NEXT:   SI_RETURN implicit $sgpr4_sgpr5
   %setcc = icmp slt i64 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
   ret i1 %select
diff --git a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
index 02a3066822e51..d6c7d686976f6 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
@@ -30,7 +30,7 @@ define i1 @extractloadi1(ptr %ptr, i32 %idx) {
 ; CHECK-NEXT:    buffer_load_ubyte v0, v1, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %val = load <8 x i1>, ptr %ptr
   %ret = extractelement <8 x i1> %val, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index 7c5f6d5e33efe..67d6600b44483 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -600,10 +600,9 @@ define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind {
 ; VI-LABEL: fmul_pow_select:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v1, 1, v1
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; VI-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc
-; VI-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
+; VI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; VI-NEXT:    s_cselect_b32 s4, 1, 2
+; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, s4
 ; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; VI-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -611,10 +610,9 @@ define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind {
 ; GFX10-LABEL: fmul_pow_select:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s4, 1, 2
+; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, s4
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -622,13 +620,12 @@ define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind {
 ; GFX11-LABEL: fmul_pow_select:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, s0
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl2 = shl nuw i32 2, %cnt
diff --git a/llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll b/llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll
index 85286841cbcac..d9f062863495a 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll
@@ -666,18 +666,14 @@ define float @fold_f32_select_user_fsub_into_fneg_modifier_ieee(i1 %cond, float
 ; SDAG-LABEL: fold_f32_select_user_fsub_into_fneg_modifier_ieee:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, v2, -v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, -v0, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f32_select_user_fsub_into_fneg_modifier_ieee:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %mul = select i1 %cond, float %sub, float %v1
@@ -688,19 +684,15 @@ define float @no_fold_f32_select_user_fsub_into_fneg_modifier_daz(i1 %cond, floa
 ; SDAG-LABEL: no_fold_f32_select_user_fsub_into_fneg_modifier_daz:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT:    v_sub_f32_e32 v1, 0x80000000, v1
-; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_select_user_fsub_into_fneg_modifier_daz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %mul = select i1 %cond, float %sub, float %v1
@@ -711,19 +703,15 @@ define float @no_fold_f32_select_user_fsub_into_fneg_modifier_dynamic(i1 %cond,
 ; SDAG-LABEL: no_fold_f32_select_user_fsub_into_fneg_modifier_dynamic:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT:    v_sub_f32_e32 v1, 0x80000000, v1
-; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_select_user_fsub_into_fneg_modifier_dynamic:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %mul = select i1 %cond, float %sub, float %v1
@@ -734,19 +722,15 @@ define half @fold_f16_select_user_fsub_into_fneg_modifier_ieee(i1 %cond, half %v
 ; SDAG-LABEL: fold_f16_select_user_fsub_into_fneg_modifier_ieee:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f16_select_user_fsub_into_fneg_modifier_ieee:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub half -0.0, %v0
   %mul = select i1 %cond, half %sub, half %v1
@@ -757,19 +741,15 @@ define half @no_fold_f16_select_user_fsub_into_fneg_modifier_daz(i1 %cond, half
 ; SDAG-LABEL: no_fold_f16_select_user_fsub_into_fneg_modifier_daz:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT:    v_sub_f16_e32 v1, 0x8000, v1
-; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT:    v_sub_f16_e32 v0, 0x8000, v0
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f16_select_user_fsub_into_fneg_modifier_daz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub half -0.0, %v0
   %mul = select i1 %cond, half %sub, half %v1
@@ -780,19 +760,15 @@ define half @no_fold_f16_select_user_fsub_into_fneg_modifier_dynamic(i1 %cond, h
 ; SDAG-LABEL: no_fold_f16_select_user_fsub_into_fneg_modifier_dynamic:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT:    v_sub_f16_e32 v1, 0x8000, v1
-; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT:    v_sub_f16_e32 v0, 0x8000, v0
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f16_select_user_fsub_into_fneg_modifier_dynamic:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub half -0.0, %v0
   %mul = select i1 %cond, half %sub, half %v1
@@ -803,21 +779,17 @@ define double @fold_f64_select_user_fsub_into_fneg_modifier_ieee(i1 %cond, doubl
 ; SDAG-LABEL: fold_f64_select_user_fsub_into_fneg_modifier_ieee:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f64_select_user_fsub_into_fneg_modifier_ieee:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_max_f64 v[1:2], -v[1:2], -v[1:2]
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub double -0.0, %v0
   %mul = select i1 %cond, double %sub, double %v1
@@ -828,21 +800,17 @@ define double @no_fold_f64_select_user_fsub_into_fneg_modifier_daz(i1 %cond, dou
 ; SDAG-LABEL: no_fold_f64_select_user_fsub_into_fneg_modifier_daz:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f64_select_user_fsub_into_fneg_modifier_daz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_max_f64 v[1:2], -v[1:2], -v[1:2]
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub double -0.0, %v0
   %mul = select i1 %cond, double %sub, double %v1
@@ -853,21 +821,17 @@ define double @no_fold_f64_select_user_fsub_into_fneg_modifier_dynamic(i1 %cond,
 ; SDAG-LABEL: no_fold_f64_select_user_fsub_into_fneg_modifier_dynamic:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f64_select_user_fsub_into_fneg_modifier_dynamic:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_max_f64 v[1:2], -v[1:2], -v[1:2]
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub double -0.0, %v0
   %mul = select i1 %cond, double %sub, double %v1
@@ -878,19 +842,15 @@ define <2 x half> @fold_v2f16_select_user_fsub_into_fneg_modifier_ieee(i1 %cond,
 ; SDAG-LABEL: fold_v2f16_select_user_fsub_into_fneg_modifier_ieee:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
-; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f16_select_user_fsub_into_fneg_modifier_ieee:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1]
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub <2 x half> <half -0.0, half -0.0>, %v0
   %mul = select i1 %cond, <2 x half> %sub, <2 x half> %v1
@@ -901,19 +861,15 @@ define <2 x half> @no_fold_v2f16_select_user_fsub_into_fneg_modifier_daz(i1 %con
 ; SDAG-LABEL: no_fold_v2f16_select_user_fsub_into_fneg_modifier_daz:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
-; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_v2f16_select_user_fsub_into_fneg_modifier_daz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1]
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub <2 x half> <half -0.0, half -0.0>, %v0
   %mul = select i1 %cond, <2 x half> %sub, <2 x half> %v1
@@ -924,19 +880,15 @@ define <2 x half> @no_fold_v2f16_select_user_fsub_into_fneg_modifier_dynamic(i1
 ; SDAG-LABEL: no_fold_v2f16_select_user_fsub_into_fneg_modifier_dynamic:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
-; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_v2f16_select_user_fsub_into_fneg_modifier_dynamic:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1]
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub <2 x half> <half -0.0, half -0.0>, %v0
   %mul = select i1 %cond, <2 x half> %sub, <2 x half> %v1
@@ -984,7 +936,6 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_ieee(float %v0) #0 {
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_cmp_class_f32_e64 s[4:5], -v0, 1
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_issnan_ieee:
@@ -992,7 +943,6 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_ieee(float %v0) #0 {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.is.fpclass.f32(float %sub, i32 1)
@@ -1005,7 +955,6 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_daz(float %v0) #1 {
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
 ; SDAG-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 1
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_issnan_daz:
@@ -1013,7 +962,6 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_daz(float %v0) #1 {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.is.fpclass.f32(float %sub, i32 1)
@@ -1026,7 +974,6 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_dynamic(float %v0) #
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
 ; SDAG-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 1
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_issnan_dynamic:
@@ -1034,7 +981,6 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_dynamic(float %v0) #
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.is.fpclass.f32(float %sub, i32 1)
@@ -1047,7 +993,6 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_ieee(float %v0)
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0x90
 ; SDAG-NEXT:    v_cmp_class_f32_e64 s[4:5], -v0, v1
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_ieee:
@@ -1055,8 +1000,7 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_ieee(float %v0)
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0x90
-; GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.is.fpclass.f32(float %sub, i32 144)
@@ -1069,8 +1013,7 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_daz(float %v0) #
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0x90
-; SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_daz:
@@ -1078,8 +1021,7 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_daz(float %v0) #
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0x90
-; GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.is.fpclass.f32(float %sub, i32 144)
@@ -1092,8 +1034,7 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_dynamic(float %v
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0x90
-; SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_dynamic:
@@ -1101,8 +1042,7 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_dynamic(float %v
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0x90
-; GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.is.fpclass.f32(float %sub, i32 144)
@@ -1114,15 +1054,13 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_var_ieee(float %v0, i32 %te
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_cmp_class_f32_e64 s[4:5], -v0, v1
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_var_ieee:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
-; GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.amdgcn.class.f32(float %sub, i32 %testmask)
@@ -1134,16 +1072,14 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_var_daz(float %v0, i32 %tes
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
-; SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_var_daz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
-; GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.amdgcn.class.f32(float %sub, i32 %testmask)
@@ -1155,16 +1091,14 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_var_dynamic(float %v0, i32
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
-; SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_var_dynamic:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
-; GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.amdgcn.class.f32(float %sub, i32 %testmask)
@@ -1176,15 +1110,13 @@ define i1 @no_fold_f64_fsub_into_fneg_modifier_class_var_daz(double %v0, i32 %te
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], -v[0:1], v2
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f64_fsub_into_fneg_modifier_class_var_daz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v2
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], v2
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub double -0.0, %v0
   %class = call i1 @llvm.amdgcn.class.f64(double %sub, i32 %testmask)
@@ -1196,16 +1128,14 @@ define i1 @no_fold_f16_fsub_into_fneg_modifier_class_var_daz(half %v0, i32 %test
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_f16_e32 v0, 0x8000, v0
-; SDAG-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f16_fsub_into_fneg_modifier_class_var_daz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
-; GISEL-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub half -0.0, %v0
   %class = call i1 @llvm.amdgcn.class.f16(half %sub, i32 %testmask)
@@ -1218,7 +1148,6 @@ define i1 @no_fold_f64_fsub_into_fneg_modifier_class_daz(double %v0) #1 {
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b32_e32 v2, 0x90
 ; SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], -v[0:1], v2
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f64_fsub_into_fneg_modifier_class_daz:
@@ -1226,8 +1155,7 @@ define i1 @no_fold_f64_fsub_into_fneg_modifier_class_daz(double %v0) #1 {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0x90
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v2
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], v2
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub double -0.0, %v0
   %class = call i1 @llvm.is.fpclass.f64(double %sub, i32 144)
@@ -1240,8 +1168,7 @@ define i1 @no_fold_f16_fsub_into_fneg_modifier_class_daz(half %v0) #1 {
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_f16_e32 v0, 0x8000, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0x90
-; SDAG-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f16_fsub_into_fneg_modifier_class_daz:
@@ -1249,8 +1176,7 @@ define i1 @no_fold_f16_fsub_into_fneg_modifier_class_daz(half %v0) #1 {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0x90
-; GISEL-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub half -0.0, %v0
   %class = call i1 @llvm.is.fpclass.f16(half %sub, i32 144)
diff --git a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
index 0b3366f71d89c..c91f8cd889c88 100644
--- a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
@@ -17,7 +17,7 @@ define i1 @i1_func_void() {
 ; GFX9-NEXT:    global_load_ubyte v0, v0, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i1_func_void:
@@ -57,7 +57,7 @@ define void @test_call_i1_func_void() {
 ; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
@@ -118,7 +118,7 @@ define zeroext i1 @zeroext_i1_func_void() {
 ; GFX9-NEXT:    global_load_ubyte v0, v0, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zeroext_i1_func_void:
@@ -158,7 +158,7 @@ define void @test_call_zeroext_i1_func_void() {
 ; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
@@ -219,7 +219,7 @@ define signext i1 @signext_i1_func_void() {
 ; GFX9-NEXT:    global_load_ubyte v0, v0, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: signext_i1_func_void:
@@ -259,7 +259,7 @@ define void @test_call_signext_i1_func_void() {
 ; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
@@ -419,8 +419,8 @@ define [2 x i1] @a2i1_func_void() {
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 1, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: a2i1_func_void:
@@ -451,7 +451,7 @@ define void @test_call_a2i1_func_void() {
 ; GFX9-LABEL: test_call_a2i1_func_void:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s8, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -465,10 +465,10 @@ define void @test_call_a2i1_func_void() {
 ; GFX9-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[6:7]
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
@@ -477,7 +477,7 @@ define void @test_call_a2i1_func_void() {
 ; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_mov_b32 s33, s8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index fb5b4a704b8a1..9d6e78aca8692 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -13,7 +13,7 @@ define i1 @i1_func_void() #0 {
 ; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
 ; GFX789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX789-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX789-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX789-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i1_func_void:
@@ -41,7 +41,7 @@ define zeroext i1 @i1_zeroext_func_void() #0 {
 ; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
 ; GFX789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX789-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX789-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX789-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i1_zeroext_func_void:
@@ -68,7 +68,7 @@ define signext i1 @i1_signext_func_void() #0 {
 ; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
 ; GFX789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX789-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX789-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX789-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i1_signext_func_void:
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 297b5180dfe9b..9fedb39dad045 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -5,375 +5,334 @@ define void @main(i1 %arg) #0 {
 ; CHECK-LABEL: main:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:    buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:    v_writelane_b32 v8, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v8, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v8, s36, 2
-; CHECK-NEXT:    v_writelane_b32 v8, s37, 3
-; CHECK-NEXT:    v_writelane_b32 v8, s38, 4
-; CHECK-NEXT:    v_writelane_b32 v8, s39, 5
-; CHECK-NEXT:    v_writelane_b32 v8, s40, 6
-; CHECK-NEXT:    v_writelane_b32 v8, s41, 7
-; CHECK-NEXT:    v_writelane_b32 v8, s42, 8
-; CHECK-NEXT:    v_writelane_b32 v8, s43, 9
-; CHECK-NEXT:    v_writelane_b32 v8, s44, 10
-; CHECK-NEXT:    v_writelane_b32 v8, s45, 11
-; CHECK-NEXT:    v_writelane_b32 v8, s46, 12
-; CHECK-NEXT:    v_writelane_b32 v8, s47, 13
-; CHECK-NEXT:    v_writelane_b32 v8, s48, 14
-; CHECK-NEXT:    v_writelane_b32 v8, s49, 15
-; CHECK-NEXT:    s_getpc_b64 s[24:25]
-; CHECK-NEXT:    v_writelane_b32 v8, s50, 16
-; CHECK-NEXT:    s_movk_i32 s4, 0xf0
-; CHECK-NEXT:    s_mov_b32 s5, s24
-; CHECK-NEXT:    v_writelane_b32 v8, s51, 17
-; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
-; CHECK-NEXT:    ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0
-; CHECK-NEXT:    s_load_dwordx4 s[28:31], s[4:5], 0x0
-; CHECK-NEXT:    s_movk_i32 s4, 0x130
-; CHECK-NEXT:    s_mov_b32 s5, s24
+; CHECK-NEXT:    s_xor_saveexec_b64 s[6:7], -1
+; CHECK-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
+; CHECK-NEXT:    v_writelane_b32 v6, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v6, s31, 1
+; CHECK-NEXT:    v_writelane_b32 v6, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v6, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v6, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v6, s39, 5
+; CHECK-NEXT:    v_writelane_b32 v6, s40, 6
+; CHECK-NEXT:    v_writelane_b32 v6, s41, 7
+; CHECK-NEXT:    v_writelane_b32 v6, s42, 8
+; CHECK-NEXT:    v_writelane_b32 v6, s43, 9
+; CHECK-NEXT:    v_writelane_b32 v6, s44, 10
+; CHECK-NEXT:    v_writelane_b32 v6, s45, 11
+; CHECK-NEXT:    v_writelane_b32 v6, s46, 12
+; CHECK-NEXT:    v_writelane_b32 v6, s47, 13
+; CHECK-NEXT:    v_writelane_b32 v6, s48, 14
+; CHECK-NEXT:    v_writelane_b32 v6, s49, 15
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    v_writelane_b32 v6, s50, 16
+; CHECK-NEXT:    s_movk_i32 s8, 0xf0
+; CHECK-NEXT:    s_mov_b32 s9, s6
+; CHECK-NEXT:    v_writelane_b32 v6, s51, 17
+; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[8:9], 0x0
+; CHECK-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
+; CHECK-NEXT:    s_mov_b64 s[8:9], 0
+; CHECK-NEXT:    s_load_dwordx4 s[28:31], s[8:9], 0x0
+; CHECK-NEXT:    s_movk_i32 s8, 0x130
+; CHECK-NEXT:    s_mov_b32 s9, s6
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v4, s36, 0
-; CHECK-NEXT:    v_writelane_b32 v4, s37, 1
-; CHECK-NEXT:    v_writelane_b32 v4, s38, 2
-; CHECK-NEXT:    v_writelane_b32 v4, s39, 3
-; CHECK-NEXT:    v_writelane_b32 v4, s40, 4
-; CHECK-NEXT:    v_writelane_b32 v4, s41, 5
-; CHECK-NEXT:    v_writelane_b32 v4, s42, 6
-; CHECK-NEXT:    v_writelane_b32 v4, s43, 7
-; CHECK-NEXT:    v_writelane_b32 v4, s44, 8
-; CHECK-NEXT:    v_writelane_b32 v4, s45, 9
-; CHECK-NEXT:    v_writelane_b32 v4, s46, 10
-; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v4, s47, 11
-; CHECK-NEXT:    v_writelane_b32 v4, s48, 12
-; CHECK-NEXT:    v_writelane_b32 v4, s49, 13
-; CHECK-NEXT:    s_mov_b32 s20, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    v_writelane_b32 v4, s50, 14
-; CHECK-NEXT:    v_mov_b32_e32 v5, s28
-; CHECK-NEXT:    v_mov_b32_e32 v6, v1
-; CHECK-NEXT:    s_mov_b32 s21, s20
-; CHECK-NEXT:    s_mov_b32 s22, s20
-; CHECK-NEXT:    s_mov_b32 s23, s20
-; CHECK-NEXT:    v_writelane_b32 v4, s51, 15
-; CHECK-NEXT:    v_mov_b32_e32 v2, v1
-; CHECK-NEXT:    image_sample_lz v5, v[5:6], s[44:51], s[20:23] dmask:0x1
+; CHECK-NEXT:    v_writelane_b32 v2, s36, 0
+; CHECK-NEXT:    v_writelane_b32 v2, s37, 1
+; CHECK-NEXT:    v_writelane_b32 v2, s38, 2
+; CHECK-NEXT:    v_writelane_b32 v2, s39, 3
+; CHECK-NEXT:    v_writelane_b32 v2, s40, 4
+; CHECK-NEXT:    v_writelane_b32 v2, s41, 5
+; CHECK-NEXT:    v_writelane_b32 v2, s42, 6
+; CHECK-NEXT:    v_writelane_b32 v2, s43, 7
+; CHECK-NEXT:    v_writelane_b32 v2, s44, 8
+; CHECK-NEXT:    v_writelane_b32 v2, s45, 9
+; CHECK-NEXT:    v_writelane_b32 v2, s46, 10
+; CHECK-NEXT:    s_load_dwordx16 s[8:23], s[8:9], 0x0
+; CHECK-NEXT:    v_writelane_b32 v2, s47, 11
+; CHECK-NEXT:    v_writelane_b32 v2, s48, 12
+; CHECK-NEXT:    v_writelane_b32 v2, s49, 13
+; CHECK-NEXT:    s_mov_b32 s24, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_writelane_b32 v2, s50, 14
+; CHECK-NEXT:    v_mov_b32_e32 v3, s28
+; CHECK-NEXT:    v_mov_b32_e32 v4, v0
+; CHECK-NEXT:    s_mov_b32 s25, s24
+; CHECK-NEXT:    s_mov_b32 s26, s24
+; CHECK-NEXT:    s_mov_b32 s27, s24
+; CHECK-NEXT:    v_writelane_b32 v2, s51, 15
+; CHECK-NEXT:    v_mov_b32_e32 v1, v0
+; CHECK-NEXT:    image_sample_lz v3, v[3:4], s[44:51], s[24:27] dmask:0x1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v4, s4, 16
-; CHECK-NEXT:    v_writelane_b32 v4, s5, 17
-; CHECK-NEXT:    v_writelane_b32 v4, s6, 18
-; CHECK-NEXT:    v_writelane_b32 v4, s7, 19
-; CHECK-NEXT:    v_writelane_b32 v4, s8, 20
-; CHECK-NEXT:    v_writelane_b32 v4, s9, 21
-; CHECK-NEXT:    image_sample_lz v6, v[1:2], s[4:11], s[20:23] dmask:0x1
-; CHECK-NEXT:    v_writelane_b32 v4, s10, 22
-; CHECK-NEXT:    v_writelane_b32 v4, s11, 23
-; CHECK-NEXT:    v_writelane_b32 v4, s12, 24
-; CHECK-NEXT:    v_writelane_b32 v4, s13, 25
-; CHECK-NEXT:    v_writelane_b32 v4, s14, 26
-; CHECK-NEXT:    v_writelane_b32 v4, s15, 27
-; CHECK-NEXT:    v_writelane_b32 v4, s16, 28
-; CHECK-NEXT:    v_writelane_b32 v8, s52, 18
-; CHECK-NEXT:    v_writelane_b32 v4, s17, 29
-; CHECK-NEXT:    v_writelane_b32 v8, s53, 19
-; CHECK-NEXT:    v_writelane_b32 v4, s18, 30
-; CHECK-NEXT:    v_writelane_b32 v8, s54, 20
-; CHECK-NEXT:    v_writelane_b32 v4, s19, 31
-; CHECK-NEXT:    s_mov_b32 s4, 48
-; CHECK-NEXT:    s_mov_b32 s5, s24
-; CHECK-NEXT:    v_writelane_b32 v8, s55, 21
-; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v8, s56, 22
-; CHECK-NEXT:    v_writelane_b32 v8, s57, 23
-; CHECK-NEXT:    v_writelane_b32 v8, s58, 24
-; CHECK-NEXT:    v_writelane_b32 v8, s59, 25
-; CHECK-NEXT:    v_writelane_b32 v8, s60, 26
+; CHECK-NEXT:    v_writelane_b32 v2, s8, 16
+; CHECK-NEXT:    v_writelane_b32 v2, s9, 17
+; CHECK-NEXT:    v_writelane_b32 v2, s10, 18
+; CHECK-NEXT:    v_writelane_b32 v2, s11, 19
+; CHECK-NEXT:    v_writelane_b32 v2, s12, 20
+; CHECK-NEXT:    v_writelane_b32 v2, s13, 21
+; CHECK-NEXT:    image_sample_lz v4, v[0:1], s[8:15], s[24:27] dmask:0x1
+; CHECK-NEXT:    v_writelane_b32 v2, s14, 22
+; CHECK-NEXT:    v_writelane_b32 v2, s15, 23
+; CHECK-NEXT:    v_writelane_b32 v2, s16, 24
+; CHECK-NEXT:    v_writelane_b32 v2, s17, 25
+; CHECK-NEXT:    v_writelane_b32 v2, s18, 26
+; CHECK-NEXT:    v_writelane_b32 v2, s19, 27
+; CHECK-NEXT:    v_writelane_b32 v2, s20, 28
+; CHECK-NEXT:    v_writelane_b32 v2, s21, 29
+; CHECK-NEXT:    v_writelane_b32 v2, s22, 30
+; CHECK-NEXT:    v_writelane_b32 v2, s23, 31
+; CHECK-NEXT:    s_mov_b32 s8, 48
+; CHECK-NEXT:    s_mov_b32 s9, s6
+; CHECK-NEXT:    s_movk_i32 s12, 0x1f0
+; CHECK-NEXT:    s_xor_b64 s[14:15], s[4:5], -1
+; CHECK-NEXT:    s_mov_b32 s13, s6
+; CHECK-NEXT:    s_mov_b32 s29, s6
+; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[8:9], 0x0
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[12:13], 0x0
+; CHECK-NEXT:    v_writelane_b32 v6, s52, 18
+; CHECK-NEXT:    v_writelane_b32 v6, s53, 19
+; CHECK-NEXT:    v_writelane_b32 v6, s54, 20
+; CHECK-NEXT:    v_writelane_b32 v6, s55, 21
+; CHECK-NEXT:    v_writelane_b32 v6, s56, 22
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v4, s4, 32
-; CHECK-NEXT:    v_writelane_b32 v8, s61, 27
-; CHECK-NEXT:    v_writelane_b32 v4, s5, 33
-; CHECK-NEXT:    v_writelane_b32 v8, s62, 28
-; CHECK-NEXT:    v_writelane_b32 v4, s6, 34
-; CHECK-NEXT:    v_writelane_b32 v8, s63, 29
-; CHECK-NEXT:    v_writelane_b32 v4, s7, 35
-; CHECK-NEXT:    v_writelane_b32 v8, s64, 30
-; CHECK-NEXT:    v_writelane_b32 v4, s8, 36
-; CHECK-NEXT:    v_writelane_b32 v8, s65, 31
-; CHECK-NEXT:    v_writelane_b32 v4, s9, 37
-; CHECK-NEXT:    v_writelane_b32 v8, s66, 32
-; CHECK-NEXT:    s_movk_i32 s26, 0x1f0
+; CHECK-NEXT:    v_writelane_b32 v2, s36, 32
+; CHECK-NEXT:    v_writelane_b32 v6, s57, 23
+; CHECK-NEXT:    v_writelane_b32 v2, s37, 33
+; CHECK-NEXT:    v_writelane_b32 v6, s58, 24
+; CHECK-NEXT:    v_writelane_b32 v2, s38, 34
+; CHECK-NEXT:    v_writelane_b32 v6, s59, 25
+; CHECK-NEXT:    v_writelane_b32 v2, s39, 35
+; CHECK-NEXT:    v_writelane_b32 v6, s60, 26
+; CHECK-NEXT:    v_writelane_b32 v2, s40, 36
+; CHECK-NEXT:    v_writelane_b32 v6, s61, 27
+; CHECK-NEXT:    v_writelane_b32 v2, s41, 37
+; CHECK-NEXT:    v_writelane_b32 v6, s62, 28
+; CHECK-NEXT:    v_writelane_b32 v2, s42, 38
+; CHECK-NEXT:    v_writelane_b32 v6, s63, 29
+; CHECK-NEXT:    v_writelane_b32 v2, s43, 39
+; CHECK-NEXT:    v_writelane_b32 v6, s64, 30
+; CHECK-NEXT:    v_writelane_b32 v2, s44, 40
+; CHECK-NEXT:    v_writelane_b32 v6, s65, 31
+; CHECK-NEXT:    v_writelane_b32 v2, s45, 41
+; CHECK-NEXT:    v_writelane_b32 v6, s66, 32
 ; CHECK-NEXT:    s_movk_i32 s28, 0x2f0
-; CHECK-NEXT:    s_mov_b32 s27, s24
-; CHECK-NEXT:    s_mov_b32 s29, s24
-; CHECK-NEXT:    v_writelane_b32 v4, s10, 38
-; CHECK-NEXT:    v_writelane_b32 v8, s67, 33
-; CHECK-NEXT:    v_writelane_b32 v4, s11, 39
-; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[26:27], 0x0
-; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[28:29], 0x0
-; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT:    s_xor_b64 s[24:25], vcc, -1
-; CHECK-NEXT:    ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
+; CHECK-NEXT:    v_writelane_b32 v2, s46, 42
+; CHECK-NEXT:    v_writelane_b32 v6, s67, 33
+; CHECK-NEXT:    v_writelane_b32 v2, s47, 43
+; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[28:29], 0x0
+; CHECK-NEXT:    v_writelane_b32 v2, s48, 44
+; CHECK-NEXT:    v_writelane_b32 v2, s49, 45
+; CHECK-NEXT:    v_writelane_b32 v2, s50, 46
+; CHECK-NEXT:    v_writelane_b32 v2, s51, 47
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_mul_f32_e32 v0, v6, v5
-; CHECK-NEXT:    s_and_saveexec_b64 s[26:27], s[24:25]
-; CHECK-NEXT:    s_xor_b64 s[26:27], exec, s[26:27]
+; CHECK-NEXT:    v_mul_f32_e32 v3, v4, v3
+; CHECK-NEXT:    s_and_saveexec_b64 s[12:13], s[14:15]
+; CHECK-NEXT:    s_xor_b64 s[12:13], exec, s[12:13]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_3
 ; CHECK-NEXT:  ; %bb.1: ; %bb48
-; CHECK-NEXT:    v_readlane_b32 s36, v4, 0
-; CHECK-NEXT:    v_readlane_b32 s44, v4, 8
-; CHECK-NEXT:    v_readlane_b32 s45, v4, 9
-; CHECK-NEXT:    v_readlane_b32 s46, v4, 10
-; CHECK-NEXT:    v_readlane_b32 s47, v4, 11
-; CHECK-NEXT:    v_readlane_b32 s48, v4, 12
-; CHECK-NEXT:    v_readlane_b32 s49, v4, 13
-; CHECK-NEXT:    v_readlane_b32 s50, v4, 14
-; CHECK-NEXT:    v_readlane_b32 s51, v4, 15
+; CHECK-NEXT:    v_readlane_b32 s36, v2, 0
+; CHECK-NEXT:    v_readlane_b32 s44, v2, 8
+; CHECK-NEXT:    v_readlane_b32 s45, v2, 9
+; CHECK-NEXT:    v_readlane_b32 s46, v2, 10
+; CHECK-NEXT:    v_readlane_b32 s47, v2, 11
+; CHECK-NEXT:    v_readlane_b32 s48, v2, 12
+; CHECK-NEXT:    v_readlane_b32 s49, v2, 13
+; CHECK-NEXT:    v_readlane_b32 s50, v2, 14
+; CHECK-NEXT:    v_readlane_b32 s51, v2, 15
 ; CHECK-NEXT:    s_and_b64 vcc, exec, -1
-; CHECK-NEXT:    v_readlane_b32 s37, v4, 1
-; CHECK-NEXT:    v_readlane_b32 s38, v4, 2
-; CHECK-NEXT:    v_readlane_b32 s39, v4, 3
-; CHECK-NEXT:    v_readlane_b32 s40, v4, 4
-; CHECK-NEXT:    image_sample_lz v5, v[1:2], s[44:51], s[20:23] dmask:0x1
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    v_readlane_b32 s41, v4, 5
-; CHECK-NEXT:    v_readlane_b32 s42, v4, 6
-; CHECK-NEXT:    v_readlane_b32 s43, v4, 7
+; CHECK-NEXT:    v_readlane_b32 s37, v2, 1
+; CHECK-NEXT:    v_readlane_b32 s38, v2, 2
+; CHECK-NEXT:    v_readlane_b32 s39, v2, 3
+; CHECK-NEXT:    v_readlane_b32 s40, v2, 4
+; CHECK-NEXT:    image_sample_lz v4, v[0:1], s[44:51], s[24:27] dmask:0x1
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_readlane_b32 s41, v2, 5
+; CHECK-NEXT:    v_readlane_b32 s42, v2, 6
+; CHECK-NEXT:    v_readlane_b32 s43, v2, 7
 ; CHECK-NEXT:  .LBB0_2: ; %bb50
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_readlane_b32 s36, v4, 32
-; CHECK-NEXT:    v_readlane_b32 s40, v4, 36
-; CHECK-NEXT:    v_readlane_b32 s41, v4, 37
-; CHECK-NEXT:    v_readlane_b32 s42, v4, 38
-; CHECK-NEXT:    v_readlane_b32 s43, v4, 39
-; CHECK-NEXT:    s_mov_b32 s21, s20
-; CHECK-NEXT:    s_mov_b32 s22, s20
-; CHECK-NEXT:    s_mov_b32 s23, s20
-; CHECK-NEXT:    v_readlane_b32 s37, v4, 33
-; CHECK-NEXT:    v_readlane_b32 s38, v4, 34
+; CHECK-NEXT:    v_readlane_b32 s36, v2, 32
+; CHECK-NEXT:    v_readlane_b32 s44, v2, 40
+; CHECK-NEXT:    v_readlane_b32 s45, v2, 41
+; CHECK-NEXT:    v_readlane_b32 s46, v2, 42
+; CHECK-NEXT:    v_readlane_b32 s47, v2, 43
+; CHECK-NEXT:    v_readlane_b32 s48, v2, 44
+; CHECK-NEXT:    v_readlane_b32 s49, v2, 45
+; CHECK-NEXT:    v_readlane_b32 s50, v2, 46
+; CHECK-NEXT:    v_readlane_b32 s51, v2, 47
+; CHECK-NEXT:    s_mov_b32 s25, s24
+; CHECK-NEXT:    s_mov_b32 s26, s24
+; CHECK-NEXT:    s_mov_b32 s27, s24
+; CHECK-NEXT:    v_readlane_b32 s37, v2, 33
+; CHECK-NEXT:    v_readlane_b32 s38, v2, 34
+; CHECK-NEXT:    image_sample_lz v5, v[0:1], s[44:51], s[8:11] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s39, v2, 35
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    image_sample_lz v6, v[1:2], s[60:67], s[40:43] dmask:0x1
-; CHECK-NEXT:    v_readlane_b32 s39, v4, 35
-; CHECK-NEXT:    image_sample_lz v1, v[1:2], s[12:19], s[20:23] dmask:0x1
+; CHECK-NEXT:    image_sample_lz v0, v[0:1], s[60:67], s[24:27] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s40, v2, 36
+; CHECK-NEXT:    v_readlane_b32 s41, v2, 37
+; CHECK-NEXT:    v_readlane_b32 s42, v2, 38
+; CHECK-NEXT:    v_readlane_b32 s43, v2, 39
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_sub_f32_e32 v1, v1, v6
-; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v0
-; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v5
+; CHECK-NEXT:    v_sub_f32_e32 v0, v0, v5
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v3
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; CHECK-NEXT:    s_mov_b64 vcc, vcc
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: ; %Flow14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_readlane_b32 s12, v4, 32
-; CHECK-NEXT:    v_readlane_b32 s13, v4, 33
-; CHECK-NEXT:    v_readlane_b32 s14, v4, 34
-; CHECK-NEXT:    v_readlane_b32 s15, v4, 35
-; CHECK-NEXT:    v_readlane_b32 s16, v4, 36
-; CHECK-NEXT:    v_readlane_b32 s17, v4, 37
-; CHECK-NEXT:    v_readlane_b32 s18, v4, 38
-; CHECK-NEXT:    v_readlane_b32 s19, v4, 39
-; CHECK-NEXT:    v_writelane_b32 v4, s4, 40
-; CHECK-NEXT:    v_writelane_b32 v4, s5, 41
-; CHECK-NEXT:    v_writelane_b32 v4, s6, 42
-; CHECK-NEXT:    v_writelane_b32 v4, s7, 43
-; CHECK-NEXT:    v_writelane_b32 v4, s8, 44
-; CHECK-NEXT:    v_writelane_b32 v4, s9, 45
-; CHECK-NEXT:    v_writelane_b32 v4, s10, 46
-; CHECK-NEXT:    v_writelane_b32 v4, s11, 47
-; CHECK-NEXT:    v_writelane_b32 v4, s12, 48
-; CHECK-NEXT:    v_writelane_b32 v4, s13, 49
-; CHECK-NEXT:    v_writelane_b32 v4, s14, 50
-; CHECK-NEXT:    v_writelane_b32 v4, s15, 51
-; CHECK-NEXT:    v_writelane_b32 v4, s16, 52
-; CHECK-NEXT:    v_writelane_b32 v4, s17, 53
-; CHECK-NEXT:    v_writelane_b32 v4, s18, 54
-; CHECK-NEXT:    v_writelane_b32 v4, s19, 55
-; CHECK-NEXT:    v_writelane_b32 v4, s52, 56
-; CHECK-NEXT:    v_writelane_b32 v3, s60, 0
-; CHECK-NEXT:    v_writelane_b32 v4, s53, 57
-; CHECK-NEXT:    v_writelane_b32 v3, s61, 1
-; CHECK-NEXT:    v_writelane_b32 v4, s54, 58
-; CHECK-NEXT:    v_writelane_b32 v3, s62, 2
-; CHECK-NEXT:    v_writelane_b32 v4, s55, 59
-; CHECK-NEXT:    v_writelane_b32 v3, s63, 3
-; CHECK-NEXT:    v_writelane_b32 v4, s56, 60
-; CHECK-NEXT:    v_writelane_b32 v3, s64, 4
-; CHECK-NEXT:    v_writelane_b32 v4, s57, 61
-; CHECK-NEXT:    v_writelane_b32 v3, s65, 5
-; CHECK-NEXT:    v_writelane_b32 v4, s58, 62
-; CHECK-NEXT:    v_writelane_b32 v3, s66, 6
-; CHECK-NEXT:    v_writelane_b32 v4, s59, 63
-; CHECK-NEXT:    v_writelane_b32 v3, s67, 7
-; CHECK-NEXT:    s_andn2_saveexec_b64 s[20:21], s[26:27]
+; CHECK-NEXT:    v_readlane_b32 s16, v2, 32
+; CHECK-NEXT:    v_readlane_b32 s17, v2, 33
+; CHECK-NEXT:    v_readlane_b32 s18, v2, 34
+; CHECK-NEXT:    v_readlane_b32 s19, v2, 35
+; CHECK-NEXT:    v_readlane_b32 s20, v2, 36
+; CHECK-NEXT:    v_readlane_b32 s21, v2, 37
+; CHECK-NEXT:    v_readlane_b32 s22, v2, 38
+; CHECK-NEXT:    v_readlane_b32 s23, v2, 39
+; CHECK-NEXT:    v_readlane_b32 s24, v2, 40
+; CHECK-NEXT:    v_readlane_b32 s25, v2, 41
+; CHECK-NEXT:    v_readlane_b32 s26, v2, 42
+; CHECK-NEXT:    v_readlane_b32 s27, v2, 43
+; CHECK-NEXT:    v_readlane_b32 s28, v2, 44
+; CHECK-NEXT:    v_readlane_b32 s29, v2, 45
+; CHECK-NEXT:    v_readlane_b32 s30, v2, 46
+; CHECK-NEXT:    v_readlane_b32 s31, v2, 47
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[12:13], s[12:13]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_10
 ; CHECK-NEXT:  ; %bb.4: ; %bb32
-; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[24:25]
-; CHECK-NEXT:    s_xor_b64 s[22:23], exec, s[8:9]
+; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[14:15]
+; CHECK-NEXT:    s_xor_b64 s[14:15], exec, s[8:9]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_6
 ; CHECK-NEXT:  ; %bb.5: ; %bb43
 ; CHECK-NEXT:    s_mov_b32 s8, 0
 ; CHECK-NEXT:    s_mov_b32 s9, s8
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s8
-; CHECK-NEXT:    v_readlane_b32 s36, v4, 0
+; CHECK-NEXT:    v_readlane_b32 s36, v2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s9
 ; CHECK-NEXT:    s_mov_b32 s10, s8
 ; CHECK-NEXT:    s_mov_b32 s11, s8
-; CHECK-NEXT:    v_readlane_b32 s37, v4, 1
-; CHECK-NEXT:    v_readlane_b32 s38, v4, 2
-; CHECK-NEXT:    v_readlane_b32 s39, v4, 3
-; CHECK-NEXT:    v_readlane_b32 s40, v4, 4
-; CHECK-NEXT:    v_readlane_b32 s41, v4, 5
-; CHECK-NEXT:    v_readlane_b32 s42, v4, 6
-; CHECK-NEXT:    v_readlane_b32 s43, v4, 7
-; CHECK-NEXT:    v_readlane_b32 s44, v4, 8
-; CHECK-NEXT:    v_readlane_b32 s45, v4, 9
-; CHECK-NEXT:    v_readlane_b32 s46, v4, 10
-; CHECK-NEXT:    v_readlane_b32 s47, v4, 11
-; CHECK-NEXT:    v_readlane_b32 s48, v4, 12
-; CHECK-NEXT:    v_readlane_b32 s49, v4, 13
-; CHECK-NEXT:    v_readlane_b32 s50, v4, 14
-; CHECK-NEXT:    v_readlane_b32 s51, v4, 15
-; CHECK-NEXT:    image_sample_lz v5, v[0:1], s[36:43], s[8:11] dmask:0x1
-; CHECK-NEXT:    v_readlane_b32 s36, v4, 16
-; CHECK-NEXT:    v_readlane_b32 s44, v4, 24
-; CHECK-NEXT:    v_readlane_b32 s45, v4, 25
-; CHECK-NEXT:    v_readlane_b32 s46, v4, 26
-; CHECK-NEXT:    v_readlane_b32 s47, v4, 27
-; CHECK-NEXT:    v_readlane_b32 s48, v4, 28
-; CHECK-NEXT:    v_readlane_b32 s49, v4, 29
-; CHECK-NEXT:    v_readlane_b32 s50, v4, 30
-; CHECK-NEXT:    v_readlane_b32 s51, v4, 31
-; CHECK-NEXT:    v_mov_b32_e32 v6, 0
-; CHECK-NEXT:    v_mov_b32_e32 v7, v6
-; CHECK-NEXT:    v_readlane_b32 s37, v4, 17
-; CHECK-NEXT:    v_readlane_b32 s38, v4, 18
-; CHECK-NEXT:    v_readlane_b32 s39, v4, 19
-; CHECK-NEXT:    image_sample_lz v0, v[0:1], s[44:51], s[12:15] dmask:0x1
-; CHECK-NEXT:    v_readlane_b32 s40, v4, 20
-; CHECK-NEXT:    v_readlane_b32 s41, v4, 21
-; CHECK-NEXT:    v_readlane_b32 s42, v4, 22
-; CHECK-NEXT:    v_readlane_b32 s43, v4, 23
+; CHECK-NEXT:    v_readlane_b32 s37, v2, 1
+; CHECK-NEXT:    v_readlane_b32 s38, v2, 2
+; CHECK-NEXT:    v_readlane_b32 s39, v2, 3
+; CHECK-NEXT:    v_readlane_b32 s40, v2, 4
+; CHECK-NEXT:    v_readlane_b32 s41, v2, 5
+; CHECK-NEXT:    v_readlane_b32 s42, v2, 6
+; CHECK-NEXT:    v_readlane_b32 s43, v2, 7
+; CHECK-NEXT:    v_readlane_b32 s44, v2, 8
+; CHECK-NEXT:    v_readlane_b32 s45, v2, 9
+; CHECK-NEXT:    v_readlane_b32 s46, v2, 10
+; CHECK-NEXT:    v_readlane_b32 s47, v2, 11
+; CHECK-NEXT:    v_readlane_b32 s48, v2, 12
+; CHECK-NEXT:    v_readlane_b32 s49, v2, 13
+; CHECK-NEXT:    v_readlane_b32 s50, v2, 14
+; CHECK-NEXT:    v_readlane_b32 s51, v2, 15
+; CHECK-NEXT:    image_sample_lz v3, v[0:1], s[36:43], s[8:11] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s36, v2, 16
+; CHECK-NEXT:    v_readlane_b32 s44, v2, 24
+; CHECK-NEXT:    v_readlane_b32 s45, v2, 25
+; CHECK-NEXT:    v_readlane_b32 s46, v2, 26
+; CHECK-NEXT:    v_readlane_b32 s47, v2, 27
+; CHECK-NEXT:    v_readlane_b32 s48, v2, 28
+; CHECK-NEXT:    v_readlane_b32 s49, v2, 29
+; CHECK-NEXT:    v_readlane_b32 s50, v2, 30
+; CHECK-NEXT:    v_readlane_b32 s51, v2, 31
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    v_mov_b32_e32 v5, v4
+; CHECK-NEXT:    v_readlane_b32 s37, v2, 17
+; CHECK-NEXT:    v_readlane_b32 s38, v2, 18
+; CHECK-NEXT:    v_readlane_b32 s39, v2, 19
+; CHECK-NEXT:    image_sample_lz v0, v[0:1], s[44:51], s[4:7] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s40, v2, 20
+; CHECK-NEXT:    v_readlane_b32 s41, v2, 21
+; CHECK-NEXT:    v_readlane_b32 s42, v2, 22
+; CHECK-NEXT:    v_readlane_b32 s43, v2, 23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_dwordx3 v[5:7], off, s[8:11], 0
+; CHECK-NEXT:    buffer_store_dwordx3 v[3:5], off, s[8:11], 0
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
 ; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; implicit-def: $vgpr3
 ; CHECK-NEXT:  .LBB0_6: ; %Flow12
-; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[22:23]
-; CHECK-NEXT:    v_readlane_b32 s52, v4, 40
-; CHECK-NEXT:    v_readlane_b32 s53, v4, 41
-; CHECK-NEXT:    v_readlane_b32 s54, v4, 42
-; CHECK-NEXT:    v_readlane_b32 s55, v4, 43
-; CHECK-NEXT:    v_readlane_b32 s56, v4, 44
-; CHECK-NEXT:    v_readlane_b32 s57, v4, 45
-; CHECK-NEXT:    v_readlane_b32 s58, v4, 46
-; CHECK-NEXT:    v_readlane_b32 s59, v4, 47
-; CHECK-NEXT:    v_readlane_b32 s60, v4, 48
-; CHECK-NEXT:    v_readlane_b32 s61, v4, 49
-; CHECK-NEXT:    v_readlane_b32 s62, v4, 50
-; CHECK-NEXT:    v_readlane_b32 s63, v4, 51
-; CHECK-NEXT:    v_readlane_b32 s64, v4, 52
-; CHECK-NEXT:    v_readlane_b32 s65, v4, 53
-; CHECK-NEXT:    v_readlane_b32 s66, v4, 54
-; CHECK-NEXT:    v_readlane_b32 s67, v4, 55
+; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[14:15]
+; CHECK-NEXT:    s_mov_b64 s[42:43], s[22:23]
+; CHECK-NEXT:    s_mov_b64 s[40:41], s[20:21]
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[18:19]
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[16:17]
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_9
 ; CHECK-NEXT:  ; %bb.7: ; %bb33.preheader
 ; CHECK-NEXT:    s_mov_b32 s8, 0
 ; CHECK-NEXT:    s_mov_b32 s6, s8
 ; CHECK-NEXT:    s_mov_b32 s7, s8
-; CHECK-NEXT:    v_mov_b32_e32 v1, s6
-; CHECK-NEXT:    v_readlane_b32 s36, v4, 56
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
 ; CHECK-NEXT:    s_mov_b32 s9, s8
 ; CHECK-NEXT:    s_mov_b32 s10, s8
 ; CHECK-NEXT:    s_mov_b32 s11, s8
-; CHECK-NEXT:    v_mov_b32_e32 v2, s7
-; CHECK-NEXT:    v_readlane_b32 s37, v4, 57
-; CHECK-NEXT:    v_readlane_b32 s38, v4, 58
-; CHECK-NEXT:    v_readlane_b32 s39, v4, 59
-; CHECK-NEXT:    v_readlane_b32 s40, v4, 60
-; CHECK-NEXT:    v_readlane_b32 s41, v4, 61
-; CHECK-NEXT:    v_readlane_b32 s42, v4, 62
-; CHECK-NEXT:    v_readlane_b32 s43, v4, 63
-; CHECK-NEXT:    s_nop 4
-; CHECK-NEXT:    image_sample_lz v5, v[1:2], s[36:43], s[8:11] dmask:0x1
-; CHECK-NEXT:    image_sample_lz v6, v[1:2], s[52:59], s[8:11] dmask:0x1
-; CHECK-NEXT:    ; kill: killed $vgpr1_vgpr2
-; CHECK-NEXT:    s_mov_b64 s[12:13], s[36:37]
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    image_sample_lz v4, v[0:1], s[36:43], s[8:11] dmask:0x1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    image_sample_lz v5, v[0:1], s[52:59], s[8:11] dmask:0x1
+; CHECK-NEXT:    ; kill: killed $vgpr0_vgpr1
+; CHECK-NEXT:    s_mov_b64 s[16:17], s[52:53]
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_and_b64 vcc, exec, 0
-; CHECK-NEXT:    v_readlane_b32 s44, v3, 0
-; CHECK-NEXT:    v_readlane_b32 s45, v3, 1
-; CHECK-NEXT:    v_readlane_b32 s46, v3, 2
-; CHECK-NEXT:    v_readlane_b32 s47, v3, 3
-; CHECK-NEXT:    v_readlane_b32 s48, v3, 4
-; CHECK-NEXT:    v_readlane_b32 s49, v3, 5
-; CHECK-NEXT:    v_readlane_b32 s50, v3, 6
-; CHECK-NEXT:    v_readlane_b32 s51, v3, 7
-; CHECK-NEXT:    s_mov_b64 s[14:15], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[16:17], s[40:41]
-; CHECK-NEXT:    s_mov_b64 s[18:19], s[42:43]
-; CHECK-NEXT:    ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-; CHECK-NEXT:    ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59
+; CHECK-NEXT:    s_mov_b64 s[18:19], s[54:55]
+; CHECK-NEXT:    s_mov_b64 s[20:21], s[56:57]
+; CHECK-NEXT:    s_mov_b64 s[22:23], s[58:59]
+; CHECK-NEXT:    ; kill: killed $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43
 ; CHECK-NEXT:    ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
+; CHECK-NEXT:    ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_sub_f32_e32 v1, v6, v5
-; CHECK-NEXT:    v_mul_f32_e32 v0, v1, v0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_sub_f32_e32 v0, v5, v4
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v3
 ; CHECK-NEXT:  .LBB0_8: ; %bb33
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_add_f32_e32 v2, v1, v0
-; CHECK-NEXT:    v_sub_f32_e32 v1, v1, v2
+; CHECK-NEXT:    v_add_f32_e32 v3, v1, v0
+; CHECK-NEXT:    v_sub_f32_e32 v1, v1, v3
 ; CHECK-NEXT:    s_mov_b64 vcc, vcc
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_8
 ; CHECK-NEXT:  .LBB0_9: ; %Flow13
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:  .LBB0_10: ; %UnifiedReturnBlock
-; CHECK-NEXT:    s_or_b64 exec, exec, s[20:21]
-; CHECK-NEXT:    v_readlane_b32 s67, v8, 33
-; CHECK-NEXT:    v_readlane_b32 s66, v8, 32
-; CHECK-NEXT:    v_readlane_b32 s65, v8, 31
-; CHECK-NEXT:    v_readlane_b32 s64, v8, 30
-; CHECK-NEXT:    v_readlane_b32 s63, v8, 29
-; CHECK-NEXT:    v_readlane_b32 s62, v8, 28
-; CHECK-NEXT:    v_readlane_b32 s61, v8, 27
-; CHECK-NEXT:    v_readlane_b32 s60, v8, 26
-; CHECK-NEXT:    v_readlane_b32 s59, v8, 25
-; CHECK-NEXT:    v_readlane_b32 s58, v8, 24
-; CHECK-NEXT:    v_readlane_b32 s57, v8, 23
-; CHECK-NEXT:    v_readlane_b32 s56, v8, 22
-; CHECK-NEXT:    v_readlane_b32 s55, v8, 21
-; CHECK-NEXT:    v_readlane_b32 s54, v8, 20
-; CHECK-NEXT:    v_readlane_b32 s53, v8, 19
-; CHECK-NEXT:    v_readlane_b32 s52, v8, 18
-; CHECK-NEXT:    v_readlane_b32 s51, v8, 17
-; CHECK-NEXT:    v_readlane_b32 s50, v8, 16
-; CHECK-NEXT:    v_readlane_b32 s49, v8, 15
-; CHECK-NEXT:    v_readlane_b32 s48, v8, 14
-; CHECK-NEXT:    v_readlane_b32 s47, v8, 13
-; CHECK-NEXT:    v_readlane_b32 s46, v8, 12
-; CHECK-NEXT:    v_readlane_b32 s45, v8, 11
-; CHECK-NEXT:    v_readlane_b32 s44, v8, 10
-; CHECK-NEXT:    v_readlane_b32 s43, v8, 9
-; CHECK-NEXT:    v_readlane_b32 s42, v8, 8
-; CHECK-NEXT:    v_readlane_b32 s41, v8, 7
-; CHECK-NEXT:    v_readlane_b32 s40, v8, 6
-; CHECK-NEXT:    v_readlane_b32 s39, v8, 5
-; CHECK-NEXT:    v_readlane_b32 s38, v8, 4
-; CHECK-NEXT:    v_readlane_b32 s37, v8, 3
-; CHECK-NEXT:    v_readlane_b32 s36, v8, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v8, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v8, 0
-; CHECK-NEXT:    ; kill: killed $vgpr4
-; CHECK-NEXT:    ; kill: killed $vgpr3
+; CHECK-NEXT:    s_or_b64 exec, exec, s[12:13]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_readlane_b32 s67, v6, 33
+; CHECK-NEXT:    v_readlane_b32 s66, v6, 32
+; CHECK-NEXT:    v_readlane_b32 s65, v6, 31
+; CHECK-NEXT:    v_readlane_b32 s64, v6, 30
+; CHECK-NEXT:    v_readlane_b32 s63, v6, 29
+; CHECK-NEXT:    v_readlane_b32 s62, v6, 28
+; CHECK-NEXT:    v_readlane_b32 s61, v6, 27
+; CHECK-NEXT:    v_readlane_b32 s60, v6, 26
+; CHECK-NEXT:    v_readlane_b32 s59, v6, 25
+; CHECK-NEXT:    v_readlane_b32 s58, v6, 24
+; CHECK-NEXT:    v_readlane_b32 s57, v6, 23
+; CHECK-NEXT:    v_readlane_b32 s56, v6, 22
+; CHECK-NEXT:    v_readlane_b32 s55, v6, 21
+; CHECK-NEXT:    v_readlane_b32 s54, v6, 20
+; CHECK-NEXT:    v_readlane_b32 s53, v6, 19
+; CHECK-NEXT:    v_readlane_b32 s52, v6, 18
+; CHECK-NEXT:    v_readlane_b32 s51, v6, 17
+; CHECK-NEXT:    v_readlane_b32 s50, v6, 16
+; CHECK-NEXT:    v_readlane_b32 s49, v6, 15
+; CHECK-NEXT:    v_readlane_b32 s48, v6, 14
+; CHECK-NEXT:    v_readlane_b32 s47, v6, 13
+; CHECK-NEXT:    v_readlane_b32 s46, v6, 12
+; CHECK-NEXT:    v_readlane_b32 s45, v6, 11
+; CHECK-NEXT:    v_readlane_b32 s44, v6, 10
+; CHECK-NEXT:    v_readlane_b32 s43, v6, 9
+; CHECK-NEXT:    v_readlane_b32 s42, v6, 8
+; CHECK-NEXT:    v_readlane_b32 s41, v6, 7
+; CHECK-NEXT:    v_readlane_b32 s40, v6, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v6, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v6, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v6, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v6, 2
+; CHECK-NEXT:    v_readlane_b32 s31, v6, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
+; CHECK-NEXT:    ; kill: killed $vgpr2
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:    buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 7799b9509ceb0..493ed5956f18f 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -646,12 +646,12 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_in_branch:
 ; GCN:       ; %bb.0: ; %bb0
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s16, s33
+; GCN-NEXT:    s_mov_b32 s18, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GCN-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    v_writelane_b32 v40, s16, 20
+; GCN-NEXT:    s_mov_b64 exec, s[20:21]
+; GCN-NEXT:    v_writelane_b32 v40, s18, 20
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
@@ -681,9 +681,7 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GCN-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[38:39], s[6:7]
 ; GCN-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GCN-NEXT:    s_and_saveexec_b64 s[46:47], vcc
+; GCN-NEXT:    s_and_saveexec_b64 s[46:47], s[16:17]
 ; GCN-NEXT:    s_cbranch_execz .LBB5_4
 ; GCN-NEXT:  ; %bb.1: ; %bb1
 ; GCN-NEXT:    s_mov_b64 s[48:49], exec
@@ -741,12 +739,12 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr_in_branch:
 ; GISEL:       ; %bb.0: ; %bb0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s16, s33
+; GISEL-NEXT:    s_mov_b32 s18, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
-; GISEL-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GISEL-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
-; GISEL-NEXT:    v_writelane_b32 v40, s16, 20
+; GISEL-NEXT:    s_mov_b64 exec, s[20:21]
+; GISEL-NEXT:    v_writelane_b32 v40, s18, 20
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
 ; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
@@ -776,9 +774,7 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GISEL-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GISEL-NEXT:    s_mov_b64 s[38:39], s[6:7]
 ; GISEL-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; GISEL-NEXT:    v_and_b32_e32 v2, 1, v2
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT:    s_and_saveexec_b64 s[46:47], vcc
+; GISEL-NEXT:    s_and_saveexec_b64 s[46:47], s[16:17]
 ; GISEL-NEXT:    s_cbranch_execz .LBB5_4
 ; GISEL-NEXT:  ; %bb.1: ; %bb1
 ; GISEL-NEXT:    s_mov_b64 s[48:49], exec
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
index 3b3e107a62967..13372dd94619b 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -162,9 +162,7 @@ define void @func_uses_lds_multi(i1 %cond) {
 ; GFX8-SDAG-LABEL: func_uses_lds_multi:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
 ; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-SDAG-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX8-SDAG-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GFX8-SDAG-NEXT:    s_mov_b32 m0, -1
 ; GFX8-SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX8-SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
@@ -199,9 +197,7 @@ define void @func_uses_lds_multi(i1 %cond) {
 ; GFX8-GISEL-LABEL: func_uses_lds_multi:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX8-GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GFX8-GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX8-GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
 ; GFX8-GISEL-NEXT:    s_cbranch_execz .LBB2_2
@@ -239,9 +235,7 @@ define void @func_uses_lds_multi(i1 %cond) {
 ; GFX9-SDAG-LABEL: func_uses_lds_multi:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-SDAG-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-SDAG-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GFX9-SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX9-SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
 ; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB2_2
@@ -267,9 +261,7 @@ define void @func_uses_lds_multi(i1 %cond) {
 ; GFX9-GISEL-LABEL: func_uses_lds_multi:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GFX9-GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX9-GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
 ; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB2_2
@@ -295,9 +287,7 @@ define void @func_uses_lds_multi(i1 %cond) {
 ; SDAG-LABEL: func_uses_lds_multi:
 ; SDAG:       ; %bb.0: ; %entry
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; SDAG-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
 ; SDAG-NEXT:    s_cbranch_execz .LBB2_2
@@ -326,9 +316,7 @@ define void @func_uses_lds_multi(i1 %cond) {
 ; GISEL-LABEL: func_uses_lds_multi:
 ; GISEL:       ; %bb.0: ; %entry
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
 ; GISEL-NEXT:    s_cbranch_execz .LBB2_3
@@ -462,113 +450,108 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
 ; GFX8-SDAG-LABEL: func_uses_lds_phi_after:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
 ; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_mov_b32_e32 v3, v0
-; GFX8-SDAG-NEXT:    flat_load_dword v0, v[1:2] glc
+; GFX8-SDAG-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-SDAG-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX8-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8-SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX8-SDAG-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX8-SDAG-NEXT:  ; %bb.1: ; %use.bb
-; GFX8-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-SDAG-NEXT:    s_mov_b32 m0, -1
-; GFX8-SDAG-NEXT:    s_mov_b64 s[6:7], 0xc8
-; GFX8-SDAG-NEXT:    ds_write_b32 v0, v0
-; GFX8-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-SDAG-NEXT:    s_mov_b64 s[4:5], 0xc8
+; GFX8-SDAG-NEXT:    ds_write_b32 v0, v2
+; GFX8-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX8-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_trap 2
-; GFX8-SDAG-NEXT:    flat_load_dword v0, v[1:2] glc
+; GFX8-SDAG-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-SDAG-NEXT:  .LBB4_2: ; %ret
-; GFX8-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-SDAG-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: func_uses_lds_phi_after:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v3, v0
-; GFX8-GISEL-NEXT:    flat_load_dword v0, v[1:2] glc
+; GFX8-GISEL-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX8-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8-GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX8-GISEL-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX8-GISEL-NEXT:  ; %bb.1: ; %use.bb
-; GFX8-GISEL-NEXT:    s_mov_b64 s[6:7], 0xc8
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-GISEL-NEXT:    s_mov_b64 s[4:5], 0xc8
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-GISEL-NEXT:    s_mov_b32 m0, -1
-; GFX8-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_trap 2
-; GFX8-GISEL-NEXT:    ds_write_b32 v0, v0
-; GFX8-GISEL-NEXT:    flat_load_dword v0, v[1:2] glc
+; GFX8-GISEL-NEXT:    ds_write_b32 v0, v2
+; GFX8-GISEL-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-GISEL-NEXT:  .LBB4_2: ; %ret
-; GFX8-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: func_uses_lds_phi_after:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, v0
-; GFX9-SDAG-NEXT:    global_load_dword v0, v[1:2], off glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v[0:1], off glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX9-SDAG-NEXT:  ; %bb.1: ; %use.bb
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT:    ds_write_b32 v0, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-SDAG-NEXT:    ds_write_b32 v0, v2
 ; GFX9-SDAG-NEXT:    s_trap 2
-; GFX9-SDAG-NEXT:    global_load_dword v0, v[1:2], off glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v[0:1], off glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:  .LBB4_2: ; %ret
-; GFX9-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: func_uses_lds_phi_after:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, v0
-; GFX9-GISEL-NEXT:    global_load_dword v0, v[1:2], off glc
+; GFX9-GISEL-NEXT:    global_load_dword v2, v[0:1], off glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX9-GISEL-NEXT:  ; %bb.1: ; %use.bb
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_trap 2
-; GFX9-GISEL-NEXT:    ds_write_b32 v0, v0
-; GFX9-GISEL-NEXT:    global_load_dword v0, v[1:2], off glc
+; GFX9-GISEL-NEXT:    ds_write_b32 v0, v2
+; GFX9-GISEL-NEXT:    global_load_dword v2, v[0:1], off glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:  .LBB4_2: ; %ret
-; GFX9-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-LABEL: func_uses_lds_phi_after:
 ; SDAG:       ; %bb.0: ; %entry
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v3, v0
-; SDAG-NEXT:    global_load_dword v0, v[1:2], off glc
+; SDAG-NEXT:    global_load_dword v2, v[0:1], off glc
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_and_b32_e32 v3, 1, v3
-; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SDAG-NEXT:    s_cbranch_execz .LBB4_3
 ; SDAG-NEXT:  ; %bb.1: ; %use.bb
-; SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; SDAG-NEXT:    ds_write_b32 v0, v0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SDAG-NEXT:    ds_write_b32 v0, v2
 ; SDAG-NEXT:    s_cbranch_execnz .LBB4_4
 ; SDAG-NEXT:  ; %bb.2: ; %use.bb
-; SDAG-NEXT:    global_load_dword v0, v[1:2], off glc
+; SDAG-NEXT:    global_load_dword v2, v[0:1], off glc
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:  .LBB4_3: ; %ret
-; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v0, v2
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ; SDAG-NEXT:  .LBB4_4:
@@ -577,22 +560,21 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
 ; GISEL-LABEL: func_uses_lds_phi_after:
 ; GISEL:       ; %bb.0: ; %entry
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v3, v0
-; GISEL-NEXT:    global_load_dword v0, v[1:2], off glc
+; GISEL-NEXT:    global_load_dword v2, v[0:1], off glc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_and_b32_e32 v3, 1, v3
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GISEL-NEXT:    s_cbranch_execz .LBB4_3
 ; GISEL-NEXT:  ; %bb.1: ; %use.bb
 ; GISEL-NEXT:    s_cbranch_execnz .LBB4_4
 ; GISEL-NEXT:  ; %bb.2: ; %use.bb
-; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    ds_write_b32 v0, v0
-; GISEL-NEXT:    global_load_dword v0, v[1:2], off glc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    ds_write_b32 v0, v2
+; GISEL-NEXT:    global_load_dword v2, v[0:1], off glc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:  .LBB4_3: ; %ret
-; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v0, v2
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ; GISEL-NEXT:  .LBB4_4:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
index 27fb4e5f965c9..68043e807f297 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
@@ -509,8 +509,7 @@ define amdgpu_kernel void @test_class_undef_f32(ptr addrspace(1) %out, float %a,
 
 ; SI-LABEL: {{^}}test_fold_and_ord:
 ; SI: s_waitcnt
-; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 32{{$}}
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
+; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 3
 ; SI-NEXT: s_setpc_b64
 define i1 @test_fold_and_ord(float %a) {
   %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
@@ -521,8 +520,7 @@ define i1 @test_fold_and_ord(float %a) {
 
 ; SI-LABEL: {{^}}test_fold_and_unord:
 ; SI: s_waitcnt
-; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 3{{$}}
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
+; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 3
 ; SI-NEXT: s_setpc_b64
 define i1 @test_fold_and_unord(float %a) {
   %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
index ea823f30f26c2..9afcfdb7a23ea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
@@ -92,31 +92,31 @@ define i1 @zeromask_bf16(bfloat %x) nounwind {
 ; GFX7CHECK-LABEL: zeromask_bf16:
 ; GFX7CHECK:       ; %bb.0:
 ; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: zeromask_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: zeromask_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: zeromask_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10CHECK-NEXT:    s_mov_b32 s4, 0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: zeromask_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11CHECK-NEXT:    s_mov_b32 s0, 0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 0)
   ret i1 %1
@@ -127,31 +127,31 @@ define i1 @allflags_bf16(bfloat %x) nounwind {
 ; GFX7CHECK-LABEL: allflags_bf16:
 ; GFX7CHECK:       ; %bb.0:
 ; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; GFX7CHECK-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: allflags_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; GFX8CHECK-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: allflags_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9CHECK-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: allflags_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; GFX10CHECK-NEXT:    s_mov_b32 s4, -1
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: allflags_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; GFX11CHECK-NEXT:    s_mov_b32 s0, -1
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 1023) ; 0x3ff
   ret i1 %1
@@ -168,7 +168,6 @@ define i1 @snan_bf16(bfloat %x) nounwind {
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX7CHECK-NEXT:    v_cmp_lt_i32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: snan_bf16:
@@ -180,7 +179,6 @@ define i1 @snan_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: snan_bf16:
@@ -192,7 +190,6 @@ define i1 @snan_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: snan_bf16:
@@ -202,7 +199,6 @@ define i1 @snan_bf16(bfloat %x) nounwind {
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e64 s4, 0x7f80, v0
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: snan_bf16:
@@ -212,7 +208,6 @@ define i1 @snan_bf16(bfloat %x) nounwind {
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e64 s0, 0x7f80, v0
 ; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 1)  ; 0x001
   ret i1 %1
@@ -225,8 +220,7 @@ define i1 @qnan_bf16(bfloat %x) nounwind {
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7fbf
-; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: qnan_bf16:
@@ -234,8 +228,7 @@ define i1 @qnan_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7fbf
-; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: qnan_bf16:
@@ -243,24 +236,21 @@ define i1 @qnan_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fbf
-; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: qnan_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e64 s4, 0x7fbf, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: qnan_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e64 s0, 0x7fbf, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 2)  ; 0x002
   ret i1 %1
@@ -273,38 +263,33 @@ define i1 @posinf_bf16(bfloat %x) nounwind {
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: posinf_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: posinf_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: posinf_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0x7f80, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: posinf_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0x7f80, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 512)  ; 0x200
   ret i1 %1
@@ -317,38 +302,33 @@ define i1 @neginf_bf16(bfloat %x) nounwind {
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7CHECK-NEXT:    s_mov_b32 s4, 0xff80
-; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: neginf_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0xff80
-; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: neginf_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0xff80
-; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: neginf_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0xff80, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: neginf_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0xff80, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 4)  ; 0x004
   ret i1 %1
@@ -367,7 +347,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
 ; GFX7CHECK-NEXT:    v_cmp_lt_i32_e64 s[4:5], -1, v1
 ; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
 ; GFX7CHECK-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: posnormal_bf16:
@@ -379,7 +358,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f00
 ; GFX8CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: posnormal_bf16:
@@ -391,7 +369,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f00
 ; GFX9CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: posnormal_bf16:
@@ -402,7 +379,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
 ; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s4, 0x7f00, v1
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: posnormal_bf16:
@@ -413,7 +389,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
 ; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s0, 0x7f00, v1
 ; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 256)  ; 0x100
   ret i1 %1
@@ -432,7 +407,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
 ; GFX7CHECK-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
 ; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
 ; GFX7CHECK-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: negnormal_bf16:
@@ -444,7 +418,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f00
 ; GFX8CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: negnormal_bf16:
@@ -456,7 +429,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f00
 ; GFX9CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: negnormal_bf16:
@@ -467,7 +439,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
 ; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s4, 0x7f00, v1
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: negnormal_bf16:
@@ -478,7 +449,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
 ; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s0, 0x7f00, v1
 ; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 8)  ; 0x008
   ret i1 %1
@@ -493,8 +463,7 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind {
 ; GFX7CHECK-NEXT:    v_add_i32_e32 v0, vcc, -1, v0
 ; GFX7CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f
-; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: possubnormal_bf16:
@@ -502,8 +471,7 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f
-; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: possubnormal_bf16:
@@ -511,24 +479,21 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f
-; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: possubnormal_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
-; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s4, 0x7f, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: possubnormal_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
-; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s0, 0x7f, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 128)  ; 0x080
   ret i1 %1
@@ -546,7 +511,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind {
 ; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
 ; GFX7CHECK-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: negsubnormal_bf16:
@@ -558,7 +522,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f
 ; GFX8CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: negsubnormal_bf16:
@@ -570,7 +533,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f
 ; GFX9CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: negsubnormal_bf16:
@@ -581,7 +543,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind {
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, -1
 ; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s4, 0x7f, v1
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: negsubnormal_bf16:
@@ -592,7 +553,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind {
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, -1
 ; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s0, 0x7f, v1
 ; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 16)  ; 0x010
   ret i1 %1
@@ -604,36 +564,31 @@ define i1 @poszero_bf16(bfloat %x) nounwind {
 ; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: poszero_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: poszero_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: poszero_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: poszero_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 64)  ; 0x040
   ret i1 %1
@@ -646,38 +601,33 @@ define i1 @negzero_bf16(bfloat %x) nounwind {
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7CHECK-NEXT:    s_mov_b32 s4, 0x8000
-; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: negzero_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: negzero_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x8000
-; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: negzero_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: negzero_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 32)  ; 0x020
   ret i1 %1
@@ -690,38 +640,33 @@ define i1 @posfinite_bf16(bfloat %x) nounwind {
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: posfinite_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: posfinite_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: posfinite_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f80, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s4, 0x7f80, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: posfinite_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s0, 0x7f80, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 448)  ; 0x1c0
   ret i1 %1
@@ -738,7 +683,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind {
 ; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
 ; GFX7CHECK-NEXT:    v_cmp_gt_i32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: negfinite_bf16:
@@ -749,7 +693,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: negfinite_bf16:
@@ -760,7 +703,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: negfinite_bf16:
@@ -770,7 +712,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind {
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e64 s4, 0x7f80, v1
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: negfinite_bf16:
@@ -780,7 +721,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind {
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e64 s0, 0x7f80, v1
 ; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 56)  ; 0x038
   ret i1 %1
@@ -793,8 +733,7 @@ define i1 @isnan_bf16(bfloat %x) nounwind {
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_bf16:
@@ -802,8 +741,7 @@ define i1 @isnan_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_bf16:
@@ -811,24 +749,21 @@ define i1 @isnan_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e64 s4, 0x7f80, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e64 s0, 0x7f80, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 3)  ; nan
   ret i1 %1
@@ -841,8 +776,7 @@ define i1 @not_isnan_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f81
-; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_isnan_bf16:
@@ -850,8 +784,7 @@ define i1 @not_isnan_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f81
-; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_isnan_bf16:
@@ -859,24 +792,21 @@ define i1 @not_isnan_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f81
-; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_isnan_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e64 s4, 0x7f81, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_isnan_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e64 s0, 0x7f81, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 1020)  ; ~nan
   ret i1 %class
@@ -1130,8 +1060,7 @@ define i1 @isinf_bf16(bfloat %x) nounwind {
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isinf_bf16:
@@ -1139,8 +1068,7 @@ define i1 @isinf_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isinf_bf16:
@@ -1148,24 +1076,21 @@ define i1 @isinf_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isinf_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0x7f80, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isinf_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0x7f80, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 516)  ; 0x204 = "inf"
   ret i1 %1
@@ -1178,8 +1103,7 @@ define i1 @isfinite_bf16(bfloat %x) nounwind {
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isfinite_bf16:
@@ -1187,8 +1111,7 @@ define i1 @isfinite_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isfinite_bf16:
@@ -1196,24 +1119,21 @@ define i1 @isfinite_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isfinite_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e64 s4, 0x7f80, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isfinite_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e64 s0, 0x7f80, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 504)  ; 0x1f8 = "finite"
   ret i1 %1
@@ -1226,40 +1146,35 @@ define i1 @issubnormal_or_zero_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
-; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: issubnormal_or_zero_bf16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
-; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: issubnormal_or_zero_bf16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
-; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: issubnormal_or_zero_bf16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
-; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: issubnormal_or_zero_bf16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
-; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 240)  ; 0xf0 = "subnormal|zero"
@@ -1273,40 +1188,35 @@ define i1 @not_issubnormal_or_zero_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
-; GFX7CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_issubnormal_or_zero_bf16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
-; GFX8CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_issubnormal_or_zero_bf16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
-; GFX9CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_issubnormal_or_zero_bf16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
-; GFX10CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_ne_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_issubnormal_or_zero_bf16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
-; GFX11CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
     %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 783)  ; ~0xf0 = "~(subnormal|zero)"
@@ -1322,8 +1232,7 @@ define i1 @isnormal_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    v_add_i32_e32 v0, vcc, 0xffffff80, v0
 ; GFX7CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f00
-; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnormal_bf16:
@@ -1332,8 +1241,7 @@ define i1 @isnormal_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f00
-; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnormal_bf16:
@@ -1342,8 +1250,7 @@ define i1 @isnormal_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f00
-; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnormal_bf16:
@@ -1351,8 +1258,7 @@ define i1 @isnormal_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
-; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s4, 0x7f00, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnormal_bf16:
@@ -1360,8 +1266,7 @@ define i1 @isnormal_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
-; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s0, 0x7f00, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 264)  ; 0x108 = "normal"
   ret i1 %class
@@ -1376,8 +1281,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    v_add_i32_e32 v0, vcc, 0xffffff80, v0
 ; GFX7CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7eff
-; GFX7CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_isnormal_bf16:
@@ -1386,8 +1290,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7eff
-; GFX8CHECK-NEXT:    v_cmp_lt_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_lt_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_isnormal_bf16:
@@ -1396,8 +1299,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7eff
-; GFX9CHECK-NEXT:    v_cmp_lt_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_lt_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_isnormal_bf16:
@@ -1405,8 +1307,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
-; GFX10CHECK-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_lt_u16_e64 s4, 0x7eff, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_isnormal_bf16:
@@ -1414,8 +1315,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
-; GFX11CHECK-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_lt_u16_e64 s0, 0x7eff, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 759)  ; ~0x108 = "~normal"
   ret i1 %class
@@ -1434,7 +1334,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
 ; GFX7CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
 ; GFX7CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_is_plus_normal_bf16:
@@ -1446,7 +1345,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7eff
 ; GFX8CHECK-NEXT:    v_cmp_lt_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_is_plus_normal_bf16:
@@ -1458,7 +1356,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7eff
 ; GFX9CHECK-NEXT:    v_cmp_lt_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_is_plus_normal_bf16:
@@ -1469,7 +1366,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
 ; GFX10CHECK-NEXT:    v_cmp_lt_u16_e64 s4, 0x7eff, v1
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_is_plus_normal_bf16:
@@ -1480,7 +1376,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
 ; GFX11CHECK-NEXT:    v_cmp_lt_u16_e64 s0, 0x7eff, v1
 ; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 767)  ; ~0x100 = ~"+normal"
   ret i1 %class
@@ -1499,7 +1394,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    v_cmp_lt_i32_e64 s[4:5], -1, v1
 ; GFX7CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
 ; GFX7CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_is_neg_normal_bf16:
@@ -1511,7 +1405,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7eff
 ; GFX8CHECK-NEXT:    v_cmp_lt_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_is_neg_normal_bf16:
@@ -1523,7 +1416,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7eff
 ; GFX9CHECK-NEXT:    v_cmp_lt_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_is_neg_normal_bf16:
@@ -1534,7 +1426,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
 ; GFX10CHECK-NEXT:    v_cmp_lt_u16_e64 s4, 0x7eff, v1
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_is_neg_normal_bf16:
@@ -1545,7 +1436,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
 ; GFX11CHECK-NEXT:    v_cmp_lt_u16_e64 s0, 0x7eff, v1
 ; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 1015)  ; ~0x008 = ~"-normal"
   ret i1 %class
@@ -1559,8 +1449,7 @@ define i1 @issubnormal_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
 ; GFX7CHECK-NEXT:    v_add_i32_e32 v0, vcc, -1, v0
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f
-; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: issubnormal_bf16:
@@ -1569,8 +1458,7 @@ define i1 @issubnormal_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f
-; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: issubnormal_bf16:
@@ -1579,8 +1467,7 @@ define i1 @issubnormal_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f
-; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: issubnormal_bf16:
@@ -1588,8 +1475,7 @@ define i1 @issubnormal_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
-; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s4, 0x7f, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: issubnormal_bf16:
@@ -1597,8 +1483,7 @@ define i1 @issubnormal_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
-; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s0, 0x7f, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 144)  ; 0x90 = "subnormal"
   ret i1 %class
@@ -1612,8 +1497,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
 ; GFX7CHECK-NEXT:    v_add_i32_e32 v0, vcc, -1, v0
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7e
-; GFX7CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_issubnormal_bf16:
@@ -1622,8 +1506,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7e
-; GFX8CHECK-NEXT:    v_cmp_lt_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_lt_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_issubnormal_bf16:
@@ -1632,8 +1515,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7e
-; GFX9CHECK-NEXT:    v_cmp_lt_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_lt_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_issubnormal_bf16:
@@ -1641,8 +1523,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
-; GFX10CHECK-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x7e, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_lt_u16_e64 s4, 0x7e, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_issubnormal_bf16:
@@ -1650,8 +1531,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
-; GFX11CHECK-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x7e, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_lt_u16_e64 s0, 0x7e, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 879)  ; ~0x90 = ~"subnormal"
   ret i1 %class
@@ -1663,40 +1543,35 @@ define i1 @iszero_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
-; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 96)  ; 0x60 = "zero"
   ret i1 %class
@@ -1708,40 +1583,35 @@ define i1 @not_iszero_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
-; GFX7CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX8CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX9CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_ne_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 927)  ; ~0x60 = ~"zero"
   ret i1 %class
@@ -1754,38 +1624,33 @@ define i1 @ispositive_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f81
-; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: ispositive_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f81
-; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: ispositive_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f81
-; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: ispositive_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s4, 0x7f81, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: ispositive_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s0, 0x7f81, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 960)  ; fcPositive
   ret i1 %class
@@ -1808,7 +1673,6 @@ define i1 @not_ispositive_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s6, v0
 ; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_ispositive_bf16:
@@ -1824,7 +1688,6 @@ define i1 @not_ispositive_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s6, v1
 ; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_ispositive_bf16:
@@ -1840,7 +1703,6 @@ define i1 @not_ispositive_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s6, v1
 ; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_ispositive_bf16:
@@ -1854,7 +1716,6 @@ define i1 @not_ispositive_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, s5
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, s6
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_ispositive_bf16:
@@ -1868,7 +1729,6 @@ define i1 @not_ispositive_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
 ; GFX11CHECK-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11CHECK-NEXT:    s_or_b32 s0, s0, s2
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 63)  ; ~fcPositive
   ret i1 %class
@@ -1889,7 +1749,6 @@ define i1 @isnegative_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v1
 ; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnegative_bf16:
@@ -1903,7 +1762,6 @@ define i1 @isnegative_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s6, v0
 ; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnegative_bf16:
@@ -1917,7 +1775,6 @@ define i1 @isnegative_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s6, v0
 ; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnegative_bf16:
@@ -1929,7 +1786,6 @@ define i1 @isnegative_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e64 s4, 0x7f80, v1
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, s5
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnegative_bf16:
@@ -1941,7 +1797,6 @@ define i1 @isnegative_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e64 s0, 0x7f80, v1
 ; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
 ; GFX11CHECK-NEXT:    s_or_b32 s0, s0, s1
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 60)  ; fcNegative
   ret i1 %class
@@ -1959,7 +1814,6 @@ define i1 @not_isnegative_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f81
 ; GFX7CHECK-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v1
 ; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_isnegative_bf16:
@@ -1971,7 +1825,6 @@ define i1 @not_isnegative_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_isnegative_bf16:
@@ -1983,7 +1836,6 @@ define i1 @not_isnegative_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_isnegative_bf16:
@@ -1993,7 +1845,6 @@ define i1 @not_isnegative_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e64 s4, 0x7f80, v1
 ; GFX10CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_isnegative_bf16:
@@ -2003,7 +1854,6 @@ define i1 @not_isnegative_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e64 s0, 0x7f80, v1
 ; GFX11CHECK-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 963)  ; ~fcNegative
   ret i1 %class
@@ -2019,7 +1869,6 @@ define i1 @iszero_or_nan_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
 ; GFX7CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_or_nan_bf16:
@@ -2030,7 +1879,6 @@ define i1 @iszero_or_nan_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
 ; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_or_nan_bf16:
@@ -2041,7 +1889,6 @@ define i1 @iszero_or_nan_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
 ; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_or_nan_bf16:
@@ -2051,7 +1898,6 @@ define i1 @iszero_or_nan_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_or_nan_bf16:
@@ -2061,7 +1907,6 @@ define i1 @iszero_or_nan_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
 ; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 99)  ; 0x60|0x3 = "zero|nan"
@@ -2078,7 +1923,6 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
 ; GFX7CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_or_nan_f_daz:
@@ -2089,7 +1933,6 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
 ; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_or_nan_f_daz:
@@ -2100,7 +1943,6 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
 ; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_or_nan_f_daz:
@@ -2110,7 +1952,6 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_or_nan_f_daz:
@@ -2120,7 +1961,6 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
 ; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 99)  ; 0x60|0x3 = "zero|nan"
@@ -2137,7 +1977,6 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
 ; GFX7CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_or_nan_f_maybe_daz:
@@ -2148,7 +1987,6 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
 ; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_or_nan_f_maybe_daz:
@@ -2159,7 +1997,6 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
 ; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_or_nan_f_maybe_daz:
@@ -2169,7 +2006,6 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_or_nan_f_maybe_daz:
@@ -2179,7 +2015,6 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
 ; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 99)  ; 0x60|0x3 = "zero|nan"
@@ -2196,7 +2031,6 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
 ; GFX7CHECK-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; GFX7CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_or_nan_bf16:
@@ -2207,7 +2041,6 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
 ; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_or_nan_bf16:
@@ -2218,7 +2051,6 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
 ; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_or_nan_bf16:
@@ -2228,7 +2060,6 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
 ; GFX10CHECK-NEXT:    v_cmp_ne_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_or_nan_bf16:
@@ -2238,7 +2069,6 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
 ; GFX11CHECK-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0
 ; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 924)  ; ~0x60 = "~(zero|nan)"
@@ -2255,7 +2085,6 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
 ; GFX7CHECK-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; GFX7CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_or_nan_f_daz:
@@ -2266,7 +2095,6 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
 ; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_or_nan_f_daz:
@@ -2277,7 +2105,6 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
 ; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_or_nan_f_daz:
@@ -2287,7 +2114,6 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
 ; GFX10CHECK-NEXT:    v_cmp_ne_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_or_nan_f_daz:
@@ -2297,7 +2123,6 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
 ; GFX11CHECK-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0
 ; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 924)  ; ~(0x60|0x3) = "~(zero|nan)"
@@ -2314,7 +2139,6 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
 ; GFX7CHECK-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; GFX7CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
@@ -2325,7 +2149,6 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
 ; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
@@ -2336,7 +2159,6 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
 ; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
@@ -2346,7 +2168,6 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
 ; GFX10CHECK-NEXT:    v_cmp_ne_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
@@ -2356,7 +2177,6 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
 ; GFX11CHECK-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0
 ; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 924)  ; ~(0x60|0x3) = "~(zero|nan)"
@@ -2373,7 +2193,6 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
 ; GFX7CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_or_qnan_bf16:
@@ -2384,7 +2203,6 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
 ; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_or_qnan_bf16:
@@ -2395,7 +2213,6 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
 ; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_or_qnan_bf16:
@@ -2405,7 +2222,6 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_or_qnan_bf16:
@@ -2415,7 +2231,6 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
 ; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 98)  ; 0x60|0x2 = "zero|qnan"
@@ -2435,7 +2250,6 @@ define i1 @iszero_or_snan_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_or_snan_bf16:
@@ -2449,7 +2263,6 @@ define i1 @iszero_or_snan_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX8CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_or_snan_bf16:
@@ -2463,7 +2276,6 @@ define i1 @iszero_or_snan_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX9CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_or_snan_bf16:
@@ -2475,7 +2287,6 @@ define i1 @iszero_or_snan_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s5, 0, v0
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s5, s4
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_or_snan_bf16:
@@ -2487,7 +2298,6 @@ define i1 @iszero_or_snan_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0
 ; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
 ; GFX11CHECK-NEXT:    s_or_b32 s0, s1, s0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 97)  ; 0x60|0x1 = "zero|snan"
@@ -2516,7 +2326,6 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    s_movk_i32 s6, 0x7f00
 ; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
 ; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_or_qnan_bf16:
@@ -2538,7 +2347,6 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    s_movk_i32 s6, 0x7f00
 ; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s6, v0
 ; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_or_qnan_bf16:
@@ -2560,7 +2368,6 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    s_movk_i32 s6, 0x7f00
 ; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s6, v0
 ; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_or_qnan_bf16:
@@ -2578,7 +2385,6 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    s_or_b32 s5, s6, s5
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s5, s4
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_or_qnan_bf16:
@@ -2596,7 +2402,6 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    s_or_b32 s1, s2, s1
 ; GFX11CHECK-NEXT:    s_or_b32 s0, s1, s0
 ; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 925)  ; ~(0x60|0x2) = "~(zero|qnan)"
@@ -2623,7 +2428,6 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    s_movk_i32 s6, 0x7f00
 ; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
 ; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_or_snan_bf16:
@@ -2643,7 +2447,6 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s6, v0
 ; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_or_snan_bf16:
@@ -2663,7 +2466,6 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s6, v0
 ; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_or_snan_bf16:
@@ -2679,7 +2481,6 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, s5
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, s6
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_or_snan_bf16:
@@ -2695,7 +2496,6 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GFX11CHECK-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11CHECK-NEXT:    s_or_b32 s0, s0, s2
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 926)  ; ~(0x60|0x1) = "~(zero|snan)"
@@ -2709,8 +2509,7 @@ define i1 @isinf_or_nan_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f7f
-; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isinf_or_nan_bf16:
@@ -2718,8 +2517,7 @@ define i1 @isinf_or_nan_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f7f
-; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isinf_or_nan_bf16:
@@ -2727,24 +2525,21 @@ define i1 @isinf_or_nan_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f7f
-; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isinf_or_nan_bf16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f7f, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e64 s4, 0x7f7f, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isinf_or_nan_bf16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f7f, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e64 s0, 0x7f7f, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 519)  ; 0x204|0x3 = "inf|nan"
@@ -2758,8 +2553,7 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) {
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_isinf_or_nan_bf16:
@@ -2767,8 +2561,7 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) {
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_isinf_or_nan_bf16:
@@ -2776,24 +2569,21 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) {
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_isinf_or_nan_bf16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e64 s4, 0x7f80, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_isinf_or_nan_bf16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e64 s0, 0x7f80, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 504)  ; ~(0x204|0x3) = "~(inf|nan)"
@@ -2807,8 +2597,7 @@ define i1 @isfinite_or_nan_f(bfloat %x) {
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX7CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_ne_u32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isfinite_or_nan_f:
@@ -2816,8 +2605,7 @@ define i1 @isfinite_or_nan_f(bfloat %x) {
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isfinite_or_nan_f:
@@ -2825,24 +2613,21 @@ define i1 @isfinite_or_nan_f(bfloat %x) {
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isfinite_or_nan_f:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0x7f80, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_ne_u16_e64 s4, 0x7f80, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isfinite_or_nan_f:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_ne_u16_e64 s0, 0x7f80, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 507)  ; 0x1f8|0x3 = "finite|nan"
@@ -2856,8 +2641,7 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) {
 ; GFX7CHECK-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
 ; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, v0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_isfinite_or_nan_f:
@@ -2865,8 +2649,7 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) {
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], s4, v0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_isfinite_or_nan_f:
@@ -2874,24 +2657,21 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) {
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], s4, v0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_isfinite_or_nan_f:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0x7f80, v0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_isfinite_or_nan_f:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0x7f80, v0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 516)  ; ~(0x1f8|0x3) = "~(finite|nan)"
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index da64c379672ef..74138ce83e095 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -99,31 +99,31 @@ define i1 @zeromask_f16(half %x) nounwind {
 ; GFX7CHECK-LABEL: zeromask_f16:
 ; GFX7CHECK:       ; %bb.0:
 ; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: zeromask_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: zeromask_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: zeromask_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10CHECK-NEXT:    s_mov_b32 s4, 0
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: zeromask_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11CHECK-NEXT:    s_mov_b32 s0, 0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 0)
   ret i1 %1
@@ -131,35 +131,65 @@ define i1 @zeromask_f16(half %x) nounwind {
 
 ; FIXME: DAG and GlobalISel return different values for i1 true
 define i1 @allflags_f16(half %x) nounwind {
-; GFX7CHECK-LABEL: allflags_f16:
-; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT:    v_mov_b32_e32 v0, 1
-; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+; GFX7SELDAG-LABEL: allflags_f16:
+; GFX7SELDAG:       ; %bb.0:
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7SELDAG-NEXT:    s_mov_b64 s[4:5], -1
+; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8CHECK-LABEL: allflags_f16:
-; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v0, 1
-; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+; GFX7GLISEL-LABEL: allflags_f16:
+; GFX7GLISEL:       ; %bb.0:
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7GLISEL-NEXT:    s_mov_b64 s[4:5], 1
+; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9CHECK-LABEL: allflags_f16:
-; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_mov_b32_e32 v0, 1
-; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+; GFX8SELDAG-LABEL: allflags_f16:
+; GFX8SELDAG:       ; %bb.0:
+; GFX8SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8SELDAG-NEXT:    s_mov_b64 s[4:5], -1
+; GFX8SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10CHECK-LABEL: allflags_f16:
-; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v0, 1
-; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+; GFX8GLISEL-LABEL: allflags_f16:
+; GFX8GLISEL:       ; %bb.0:
+; GFX8GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GLISEL-NEXT:    s_mov_b64 s[4:5], 1
+; GFX8GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11CHECK-LABEL: allflags_f16:
-; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_mov_b32_e32 v0, 1
-; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+; GFX9SELDAG-LABEL: allflags_f16:
+; GFX9SELDAG:       ; %bb.0:
+; GFX9SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9SELDAG-NEXT:    s_mov_b64 s[4:5], -1
+; GFX9SELDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GLISEL-LABEL: allflags_f16:
+; GFX9GLISEL:       ; %bb.0:
+; GFX9GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GLISEL-NEXT:    s_mov_b64 s[4:5], 1
+; GFX9GLISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10SELDAG-LABEL: allflags_f16:
+; GFX10SELDAG:       ; %bb.0:
+; GFX10SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10SELDAG-NEXT:    s_mov_b32 s4, -1
+; GFX10SELDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10GLISEL-LABEL: allflags_f16:
+; GFX10GLISEL:       ; %bb.0:
+; GFX10GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10GLISEL-NEXT:    s_mov_b32 s4, 1
+; GFX10GLISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11SELDAG-LABEL: allflags_f16:
+; GFX11SELDAG:       ; %bb.0:
+; GFX11SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11SELDAG-NEXT:    s_mov_b32 s0, -1
+; GFX11SELDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11GLISEL-LABEL: allflags_f16:
+; GFX11GLISEL:       ; %bb.0:
+; GFX11GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11GLISEL-NEXT:    s_mov_b32 s0, 1
+; GFX11GLISEL-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 1023) ; 0x3ff
   ret i1 %1
 }
@@ -175,7 +205,6 @@ define i1 @snan_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e64 s[4:5], s5, v0
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: snan_f16:
@@ -185,36 +214,31 @@ define i1 @snan_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xffff83ff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x1ff
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: snan_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: snan_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: snan_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 1
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: snan_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 1
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 1)  ; 0x001
   ret i1 %1
@@ -227,8 +251,7 @@ define i1 @qnan_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7dff
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: qnan_f16:
@@ -237,36 +260,31 @@ define i1 @qnan_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7e00
-; GFX7GLISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: qnan_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 2
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: qnan_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 2
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: qnan_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 2
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: qnan_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 2
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 2)  ; 0x002
   ret i1 %1
@@ -278,8 +296,7 @@ define i1 @posinf_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
-; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: posinf_f16:
@@ -287,38 +304,33 @@ define i1 @posinf_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: posinf_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x200
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: posinf_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x200
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: posinf_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x200
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: posinf_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x200
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 512)  ; 0x200
   ret i1 %1
@@ -330,8 +342,7 @@ define i1 @neginf_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_mov_b32 s4, 0xfc00
-; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: neginf_f16:
@@ -339,36 +350,31 @@ define i1 @neginf_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0xfc00
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: neginf_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 4
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: neginf_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 4
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: neginf_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 4
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: neginf_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 4
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 4)  ; 0x004
   ret i1 %1
@@ -387,7 +393,6 @@ define i1 @posnormal_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e64 s[4:5], -1, v1
 ; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: posnormal_f16:
@@ -402,37 +407,32 @@ define i1 @posnormal_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: posnormal_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x100
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: posnormal_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x100
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: posnormal_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x100
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: posnormal_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x100
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 256)  ; 0x100
   ret i1 %1
@@ -451,7 +451,6 @@ define i1 @negnormal_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
 ; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: negnormal_f16:
@@ -466,35 +465,30 @@ define i1 @negnormal_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: negnormal_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 8
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: negnormal_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 8
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: negnormal_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 8
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: negnormal_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 8
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 8)  ; 0x008
   ret i1 %1
@@ -508,8 +502,7 @@ define i1 @possubnormal_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x3ff
 ; GFX7SELDAG-NEXT:    v_add_i32_e32 v0, vcc, -1, v0
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: possubnormal_f16:
@@ -518,38 +511,33 @@ define i1 @possubnormal_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: possubnormal_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x80
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: possubnormal_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x80
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: possubnormal_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x80
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: possubnormal_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x80
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 128)  ; 0x080
   ret i1 %1
@@ -567,7 +555,6 @@ define i1 @negsubnormal_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
 ; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: negsubnormal_f16:
@@ -582,35 +569,30 @@ define i1 @negsubnormal_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: negsubnormal_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 16
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: negsubnormal_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 16
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: negsubnormal_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 16
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: negsubnormal_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 16
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 16)  ; 0x010
   ret i1 %1
@@ -621,44 +603,38 @@ define i1 @poszero_f16(half %x) nounwind {
 ; GFX7SELDAG:       ; %bb.0:
 ; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: poszero_f16:
 ; GFX7GLISEL:       ; %bb.0:
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: poszero_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 64
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: poszero_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 64
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: poszero_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 64
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: poszero_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 64
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 64)  ; 0x040
   ret i1 %1
@@ -670,8 +646,7 @@ define i1 @negzero_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_mov_b32 s4, 0x8000
-; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: negzero_f16:
@@ -679,36 +654,31 @@ define i1 @negzero_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x8000
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: negzero_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 32
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: negzero_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 32
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: negzero_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 32
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: negzero_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 32
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 32)  ; 0x020
   ret i1 %1
@@ -720,8 +690,7 @@ define i1 @posfinite_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
-; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: posfinite_f16:
@@ -729,38 +698,33 @@ define i1 @posfinite_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: posfinite_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x1c0
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: posfinite_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x1c0
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: posfinite_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x1c0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: posfinite_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x1c0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 448)  ; 0x1c0
   ret i1 %1
@@ -777,7 +741,6 @@ define i1 @negfinite_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
 ; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: negfinite_f16:
@@ -790,35 +753,30 @@ define i1 @negfinite_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v0, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v1, v0
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: negfinite_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 56
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: negfinite_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 56
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: negfinite_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 56
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: negfinite_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 56
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 56)  ; 0x038
   ret i1 %1
@@ -831,8 +789,7 @@ define i1 @isnan_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isnan_f16:
@@ -841,36 +798,31 @@ define i1 @isnan_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 3
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 3
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 3
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 3
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 3)  ; nan
   ret i1 %1
@@ -883,8 +835,7 @@ define i1 @not_isnan_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c01
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_isnan_f16:
@@ -893,38 +844,33 @@ define i1 @not_isnan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c01
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_isnan_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x3fc
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_isnan_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x3fc
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_isnan_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x3fc
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_isnan_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x3fc
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = call i1 @llvm.is.fpclass.f16(half %x, i32 1020)  ; ~nan
   ret i1 %class
@@ -1316,8 +1262,7 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind {
 ; GFX7SELDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isnan_f16_strictfp:
@@ -1326,36 +1271,31 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_f16_strictfp:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 3
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_f16_strictfp:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 3
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_f16_strictfp:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 3
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_f16_strictfp:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 3
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 3) strictfp ; nan
   ret i1 %1
@@ -1368,8 +1308,7 @@ define i1 @isinf_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isinf_f16:
@@ -1378,38 +1317,33 @@ define i1 @isinf_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isinf_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x204
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isinf_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x204
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isinf_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x204
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isinf_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x204
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 516)  ; 0x204 = "inf"
   ret i1 %1
@@ -1422,8 +1356,7 @@ define i1 @isfinite_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isfinite_f16:
@@ -1432,38 +1365,33 @@ define i1 @isfinite_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isfinite_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f8
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isfinite_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f8
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isfinite_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x1f8
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isfinite_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x1f8
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 504)  ; 0x1f8 = "finite"
   ret i1 %1
@@ -1475,8 +1403,7 @@ define i1 @issubnormal_or_zero_f16(half %x) {
 ; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7c00, v0
-; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: issubnormal_or_zero_f16:
@@ -1484,38 +1411,33 @@ define i1 @issubnormal_or_zero_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7c00, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: issubnormal_or_zero_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0xf0
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: issubnormal_or_zero_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0xf0
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: issubnormal_or_zero_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0xf0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: issubnormal_or_zero_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0xf0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 240)  ; 0xf0 = "subnormal|zero"
@@ -1528,8 +1450,7 @@ define i1 @not_issubnormal_or_zero_f16(half %x) {
 ; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7c00, v0
-; GFX7SELDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_issubnormal_or_zero_f16:
@@ -1544,37 +1465,32 @@ define i1 @not_issubnormal_or_zero_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_issubnormal_or_zero_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x30f
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_issubnormal_or_zero_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x30f
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_issubnormal_or_zero_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x30f
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_issubnormal_or_zero_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x30f
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
     %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 783)  ; ~0xf0 = "~(subnormal|zero)"
@@ -1590,8 +1506,7 @@ define i1 @isnormal_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffc00, v0
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isnormal_f16:
@@ -1601,38 +1516,33 @@ define i1 @isnormal_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnormal_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x108
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnormal_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x108
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnormal_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x108
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnormal_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x108
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 264)  ; 0x108 = "normal"
   ret i1 %class
@@ -1647,8 +1557,7 @@ define i1 @not_isnormal_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffc00, v0
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7SELDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_isnormal_f16:
@@ -1664,37 +1573,32 @@ define i1 @not_isnormal_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_isnormal_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x2f7
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_isnormal_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x2f7
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_isnormal_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x2f7
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_isnormal_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x2f7
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 759)  ; ~0x108 = "~normal"
   ret i1 %class
@@ -1713,7 +1617,6 @@ define i1 @not_is_plus_normal_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
 ; GFX7SELDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_is_plus_normal_f16:
@@ -1737,37 +1640,32 @@ define i1 @not_is_plus_normal_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_is_plus_normal_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x2ff
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_is_plus_normal_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x2ff
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_is_plus_normal_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x2ff
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_is_plus_normal_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x2ff
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 767)  ; ~0x100 = ~"+normal"
   ret i1 %class
@@ -1786,7 +1684,6 @@ define i1 @not_is_neg_normal_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e64 s[4:5], -1, v1
 ; GFX7SELDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_is_neg_normal_f16:
@@ -1810,37 +1707,32 @@ define i1 @not_is_neg_normal_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_is_neg_normal_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x3f7
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_is_neg_normal_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x3f7
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_is_neg_normal_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x3f7
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_is_neg_normal_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x3f7
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 1015)  ; ~0x008 = ~"-normal"
   ret i1 %class
@@ -1854,8 +1746,7 @@ define i1 @issubnormal_f16(half %x) {
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x3ff
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_add_i32_e32 v0, vcc, -1, v0
-; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: issubnormal_f16:
@@ -1865,38 +1756,33 @@ define i1 @issubnormal_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: issubnormal_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x90
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: issubnormal_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x90
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: issubnormal_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x90
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: issubnormal_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x90
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 144)  ; 0x90 = "subnormal"
   ret i1 %class
@@ -1910,8 +1796,7 @@ define i1 @not_issubnormal_f16(half %x) {
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x3fe
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_add_i32_e32 v0, vcc, -1, v0
-; GFX7SELDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_issubnormal_f16:
@@ -1930,37 +1815,32 @@ define i1 @not_issubnormal_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_issubnormal_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x36f
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_issubnormal_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x36f
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_issubnormal_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x36f
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_issubnormal_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x36f
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 879)  ; ~0x90 = ~"subnormal"
   ret i1 %class
@@ -1972,8 +1852,7 @@ define i1 @iszero_f16(half %x) {
 ; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: iszero_f16:
@@ -1981,38 +1860,33 @@ define i1 @iszero_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x60
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x60
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x60
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x60
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 96)  ; 0x60 = "zero"
   ret i1 %class
@@ -2024,8 +1898,7 @@ define i1 @not_iszero_f16(half %x) {
 ; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_iszero_f16:
@@ -2047,37 +1920,32 @@ define i1 @not_iszero_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x39f
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x39f
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x39f
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x39f
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 927)  ; ~0x60 = ~"zero"
   ret i1 %class
@@ -2089,8 +1957,7 @@ define i1 @ispositive_f16(half %x) {
 ; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c01
-; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: ispositive_f16:
@@ -2098,38 +1965,33 @@ define i1 @ispositive_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c01
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: ispositive_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x3c0
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: ispositive_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x3c0
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: ispositive_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x3c0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: ispositive_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x3c0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 960)  ; fcPositive
   ret i1 %class
@@ -2151,7 +2013,6 @@ define i1 @not_ispositive_f16(half %x) {
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s6, v2
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_ispositive_f16:
@@ -2169,35 +2030,30 @@ define i1 @not_ispositive_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v1, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_ispositive_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 63
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_ispositive_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 63
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_ispositive_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 63
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_ispositive_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 63
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 63)  ; ~fcPositive
   ret i1 %class
@@ -2217,7 +2073,6 @@ define i1 @isnegative_f16(half %x) {
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isnegative_f16:
@@ -2233,35 +2088,30 @@ define i1 @isnegative_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnegative_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 60
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnegative_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 60
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnegative_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 60
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnegative_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 60
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 60)  ; fcNegative
   ret i1 %class
@@ -2273,12 +2123,11 @@ define i1 @not_isnegative_f16(half %x) {
 ; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c01
-; GFX7SELDAG-NEXT:    s_movk_i32 s5, 0x7c00
 ; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e64 s[4:5], s5, v0
+; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
+; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_isnegative_f16:
@@ -2292,37 +2141,32 @@ define i1 @not_isnegative_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_isnegative_f16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x3c3
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_isnegative_f16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x3c3
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_isnegative_f16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x3c3
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_isnegative_f16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x3c3
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 963)  ; ~fcNegative
   ret i1 %class
@@ -2338,7 +2182,6 @@ define i1 @iszero_or_nan_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: iszero_or_nan_f16:
@@ -2348,38 +2191,33 @@ define i1 @iszero_or_nan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xffff83ff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0xffff8400
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_or_nan_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x63
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_or_nan_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x63
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_or_nan_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x63
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_or_nan_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x63
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 99)  ; 0x60|0x3 = "zero|nan"
@@ -2396,7 +2234,6 @@ define i1 @iszero_or_nan_f_daz(half %x) #0 {
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: iszero_or_nan_f_daz:
@@ -2406,38 +2243,33 @@ define i1 @iszero_or_nan_f_daz(half %x) #0 {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xffff83ff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0xffff8400
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_or_nan_f_daz:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x63
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_or_nan_f_daz:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x63
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_or_nan_f_daz:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x63
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_or_nan_f_daz:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x63
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 99)  ; 0x60|0x3 = "zero|nan"
@@ -2454,7 +2286,6 @@ define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 {
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: iszero_or_nan_f_maybe_daz:
@@ -2464,38 +2295,33 @@ define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xffff83ff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0xffff8400
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_or_nan_f_maybe_daz:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x63
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_or_nan_f_maybe_daz:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x63
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_or_nan_f_maybe_daz:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x63
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_or_nan_f_maybe_daz:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x63
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 99)  ; 0x60|0x3 = "zero|nan"
@@ -2512,7 +2338,6 @@ define i1 @not_iszero_or_nan_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_iszero_or_nan_f16:
@@ -2532,37 +2357,32 @@ define i1 @not_iszero_or_nan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_or_nan_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x39c
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_or_nan_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x39c
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_or_nan_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x39c
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_or_nan_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x39c
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 924)  ; ~0x60 = "~(zero|nan)"
@@ -2579,7 +2399,6 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 {
 ; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_iszero_or_nan_f_daz:
@@ -2599,37 +2418,32 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_or_nan_f_daz:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x39c
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_or_nan_f_daz:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x39c
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_or_nan_f_daz:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x39c
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_or_nan_f_daz:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x39c
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 924)  ; ~(0x60|0x3) = "~(zero|nan)"
@@ -2646,7 +2460,6 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 {
 ; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_iszero_or_nan_f_maybe_daz:
@@ -2666,37 +2479,32 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x39c
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x39c
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x39c
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x39c
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 924)  ; ~(0x60|0x3) = "~(zero|nan)"
@@ -2713,7 +2521,6 @@ define i1 @iszero_or_qnan_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: iszero_or_qnan_f16:
@@ -2723,38 +2530,33 @@ define i1 @iszero_or_qnan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xffff8200, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0xffff8201
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_or_qnan_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x62
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_or_qnan_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x62
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_or_qnan_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x62
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_or_qnan_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x62
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 98)  ; 0x60|0x2 = "zero|qnan"
@@ -2774,7 +2576,6 @@ define i1 @iszero_or_snan_f16(half %x) {
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: iszero_or_snan_f16:
@@ -2787,37 +2588,32 @@ define i1 @iszero_or_snan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x1ff
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_or_snan_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x61
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_or_snan_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x61
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_or_snan_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x61
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_or_snan_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x61
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 97)  ; 0x60|0x1 = "zero|snan"
@@ -2846,7 +2642,6 @@ define i1 @not_iszero_or_qnan_f16(half %x) {
 ; GFX7SELDAG-NEXT:    s_movk_i32 s6, 0x7800
 ; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_iszero_or_qnan_f16:
@@ -2870,37 +2665,32 @@ define i1 @not_iszero_or_qnan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_or_qnan_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x39d
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_or_qnan_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x39d
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_or_qnan_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x39d
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_or_qnan_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x39d
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 925)  ; ~(0x60|0x2) = "~(zero|qnan)"
@@ -2927,7 +2717,6 @@ define i1 @not_iszero_or_snan_f16(half %x) {
 ; GFX7SELDAG-NEXT:    s_movk_i32 s6, 0x7800
 ; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_iszero_or_snan_f16:
@@ -2950,37 +2739,32 @@ define i1 @not_iszero_or_snan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_or_snan_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x39e
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_or_snan_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x39e
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_or_snan_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x39e
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_or_snan_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x39e
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 926)  ; ~(0x60|0x1) = "~(zero|snan)"
@@ -2994,8 +2778,7 @@ define i1 @isinf_or_nan_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7bff
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isinf_or_nan_f16:
@@ -3004,38 +2787,33 @@ define i1 @isinf_or_nan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isinf_or_nan_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x207
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isinf_or_nan_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x207
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isinf_or_nan_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x207
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isinf_or_nan_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x207
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 519)  ; 0x204|0x3 = "inf|nan"
@@ -3049,8 +2827,7 @@ define i1 @not_isinf_or_nan_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_isinf_or_nan_f16:
@@ -3059,38 +2836,33 @@ define i1 @not_isinf_or_nan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_isinf_or_nan_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f8
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_isinf_or_nan_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f8
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_isinf_or_nan_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x1f8
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_isinf_or_nan_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x1f8
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 504)  ; ~(0x204|0x3) = "~(inf|nan)"
@@ -3104,8 +2876,7 @@ define i1 @isfinite_or_nan_f(half %x) {
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_ne_u32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isfinite_or_nan_f:
@@ -3114,38 +2885,33 @@ define i1 @isfinite_or_nan_f(half %x) {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isfinite_or_nan_f:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x1fb
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isfinite_or_nan_f:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x1fb
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isfinite_or_nan_f:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x1fb
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isfinite_or_nan_f:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x1fb
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 507)  ; 0x1f8|0x3 = "finite|nan"
@@ -3159,8 +2925,7 @@ define i1 @not_isfinite_or_nan_f(half %x) {
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_isfinite_or_nan_f:
@@ -3169,38 +2934,33 @@ define i1 @not_isfinite_or_nan_f(half %x) {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_isfinite_or_nan_f:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x204
-; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_isfinite_or_nan_f:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x204
-; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_isfinite_or_nan_f:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x204
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_isfinite_or_nan_f:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x204
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 516)  ; ~(0x1f8|0x3) = "~(finite|nan)"
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
index 347e549e7cf56..37217ed6d64f7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
@@ -180,36 +180,30 @@ define i1 @isnan_f32(float %x) nounwind {
 ; GFX7CHECK:       ; %bb.0:
 ; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_f32:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_f32:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_f32:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 3
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_f32:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 3
-; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 3)  ; nan
   ret i1 %1
@@ -989,36 +983,30 @@ define i1 @isnan_f64(double %x) nounwind {
 ; GFX7CHECK:       ; %bb.0:
 ; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 3
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_f64:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 3
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_f64:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 3
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_f64:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f64_e64 s4, v[0:1], 3
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_f64:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f64_e64 s0, v[0:1], 3
-; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 3)  ; nan
   ret i1 %1
@@ -1029,36 +1017,30 @@ define i1 @isnan_f32_strictfp(float %x) strictfp nounwind {
 ; GFX7CHECK:       ; %bb.0:
 ; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_f32_strictfp:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_f32_strictfp:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_f32_strictfp:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 3
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_f32_strictfp:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 3
-; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 3) strictfp ; nan
   ret i1 %1
@@ -1069,36 +1051,30 @@ define i1 @isnan_f64_strictfp(double %x) strictfp nounwind {
 ; GFX7CHECK:       ; %bb.0:
 ; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 3
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_f64_strictfp:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 3
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_f64_strictfp:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 3
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_f64_strictfp:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f64_e64 s4, v[0:1], 3
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_f64_strictfp:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f64_e64 s0, v[0:1], 3
-; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 3) strictfp ; nan
   ret i1 %1
@@ -1109,39 +1085,33 @@ define i1 @isinf_f32(float %x) nounwind {
 ; GFX7CHECK:       ; %bb.0:
 ; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mov_b32_e32 v1, 0x204
-; GFX7CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isinf_f32:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x204
-; GFX8CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isinf_f32:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x204
-; GFX9CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isinf_f32:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 0x204
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isinf_f32:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 0x204
-; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 516)  ; 0x204 = "inf"
   ret i1 %1
@@ -1152,39 +1122,33 @@ define i1 @isinf_f64(double %x) nounwind {
 ; GFX7CHECK:       ; %bb.0:
 ; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mov_b32_e32 v2, 0x204
-; GFX7CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v2
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], v2
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isinf_f64:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v2, 0x204
-; GFX8CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v2
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], v2
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isinf_f64:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v2, 0x204
-; GFX9CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v2
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], v2
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isinf_f64:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f64_e64 s4, v[0:1], 0x204
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isinf_f64:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f64_e64 s0, v[0:1], 0x204
-; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 516)  ; 0x204 = "inf"
   ret i1 %1
@@ -1195,39 +1159,33 @@ define i1 @isfinite_f32(float %x) nounwind {
 ; GFX7CHECK:       ; %bb.0:
 ; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f8
-; GFX7CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isfinite_f32:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f8
-; GFX8CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isfinite_f32:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f8
-; GFX9CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isfinite_f32:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 0x1f8
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isfinite_f32:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 0x1f8
-; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 504)  ; 0x1f8 = "finite"
   ret i1 %1
@@ -1238,39 +1196,33 @@ define i1 @isfinite_f64(double %x) nounwind {
 ; GFX7CHECK:       ; %bb.0:
 ; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mov_b32_e32 v2, 0x1f8
-; GFX7CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v2
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], v2
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isfinite_f64:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v2, 0x1f8
-; GFX8CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v2
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], v2
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isfinite_f64:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v2, 0x1f8
-; GFX9CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v2
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], v2
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isfinite_f64:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f64_e64 s4, v[0:1], 0x1f8
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isfinite_f64:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f64_e64 s0, v[0:1], 0x1f8
-; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 504)  ; 0x1f8 = "finite"
   ret i1 %1
@@ -1281,39 +1233,33 @@ define i1 @isnormal_f32(float %x) nounwind {
 ; GFX7CHECK:       ; %bb.0:
 ; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mov_b32_e32 v1, 0x108
-; GFX7CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnormal_f32:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x108
-; GFX8CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnormal_f32:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x108
-; GFX9CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnormal_f32:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 0x108
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnormal_f32:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 0x108
-; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 264)  ; 0x108 = "normal"
   ret i1 %1
@@ -1377,39 +1323,33 @@ define i1 @issubnormal_f32(float %x) nounwind {
 ; GFX7CHECK:       ; %bb.0:
 ; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mov_b32_e32 v1, 0x90
-; GFX7CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: issubnormal_f32:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x90
-; GFX8CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: issubnormal_f32:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x90
-; GFX9CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: issubnormal_f32:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 0x90
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: issubnormal_f32:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 0x90
-; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 144)  ; 0x90 = "subnormal"
   ret i1 %1
@@ -1420,39 +1360,33 @@ define i1 @iszero_f32(float %x) nounwind {
 ; GFX7CHECK:       ; %bb.0:
 ; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mov_b32_e32 v1, 0x60
-; GFX7CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_f32:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x60
-; GFX8CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_f32:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x60
-; GFX9CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v1
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_f32:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 0x60
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_f32:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 0x60
-; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 96)  ; 0x60 = "zero"
   ret i1 %1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index c3e665fa8269a..779f82454e379 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -16,17 +16,16 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; SI-NEXT:    v_mul_hi_u32 v8, v0, v2
 ; SI-NEXT:    v_mul_hi_u32 v9, v1, v3
 ; SI-NEXT:    v_mul_lo_u32 v3, v1, v3
-; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, v8, v7
-; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v6, vcc
-; SI-NEXT:    v_add_i32_e32 v6, vcc, v1, v5
+; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; SI-NEXT:    v_add_i32_e32 v7, vcc, v1, v5
 ; SI-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v5
-; SI-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v9, vcc
-; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
+; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v9, vcc
+; SI-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
+; SI-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[3:4]
+; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: umulo_i64_v_v:
@@ -46,9 +45,8 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: umulo_i64_v_v:
@@ -69,8 +67,7 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[2:3]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: umulo_i64_v_v:
@@ -95,8 +92,7 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v10
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ne_u64_e64 s0, 0, v[2:3]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: umulo_i64_v_v:
@@ -125,8 +121,7 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX12-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v2
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX12-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_ne_u64_e64 s0, 0, v[2:3]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
@@ -137,38 +132,36 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; SI-LABEL: smulo_i64_v_v:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mul_hi_u32 v6, v1, v2
-; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
-; SI-NEXT:    v_mul_hi_u32 v7, v0, v3
-; SI-NEXT:    v_mul_lo_u32 v8, v0, v3
-; SI-NEXT:    v_mul_hi_u32 v9, v0, v2
-; SI-NEXT:    v_mul_hi_i32 v10, v1, v3
-; SI-NEXT:    v_mul_lo_u32 v11, v1, v3
-; SI-NEXT:    v_mul_lo_u32 v4, v0, v2
-; SI-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; SI-NEXT:    v_add_i32_e32 v9, vcc, v8, v5
-; SI-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
-; SI-NEXT:    v_addc_u32_e32 v8, vcc, v7, v6, vcc
-; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
-; SI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; SI-NEXT:    v_mov_b32_e32 v7, v6
-; SI-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; SI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; SI-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
-; SI-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v9, vcc
+; SI-NEXT:    v_mul_hi_u32 v5, v1, v2
+; SI-NEXT:    v_mul_lo_u32 v4, v1, v2
+; SI-NEXT:    v_mul_hi_u32 v6, v0, v3
+; SI-NEXT:    v_mul_lo_u32 v7, v0, v3
+; SI-NEXT:    v_mul_hi_u32 v8, v0, v2
+; SI-NEXT:    v_mul_hi_i32 v9, v1, v3
+; SI-NEXT:    v_mul_lo_u32 v10, v1, v3
+; SI-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; SI-NEXT:    v_add_i32_e32 v8, vcc, v7, v4
+; SI-NEXT:    v_add_i32_e64 v4, s[4:5], v7, v4
+; SI-NEXT:    v_addc_u32_e32 v7, vcc, v6, v5, vcc
+; SI-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; SI-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
+; SI-NEXT:    v_mov_b32_e32 v6, v5
+; SI-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; SI-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; SI-NEXT:    v_sub_i32_e32 v9, vcc, v7, v2
+; SI-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v8, vcc
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
-; SI-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; SI-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
+; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v10, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; SI-NEXT:    v_sub_i32_e32 v9, vcc, v7, v0
 ; SI-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
-; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7]
-; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-NEXT:    v_mov_b32_e32 v0, v4
-; SI-NEXT:    v_mov_b32_e32 v1, v5
+; SI-NEXT:    v_cndmask_b32_e32 v8, v1, v8, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; SI-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[7:8], v[5:6]
+; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
+; SI-NEXT:    v_mov_b32_e32 v1, v4
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: smulo_i64_v_v:
@@ -201,8 +194,7 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v5, v4
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[2:3], v[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: smulo_i64_v_v:
@@ -226,17 +218,16 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v7, v2
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc_lo
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v6, v4
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v7, v4
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, 0, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u64_e64 s4, v[4:5], v[2:3]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: smulo_i64_v_v:
@@ -265,19 +256,18 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v7, v2, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc_lo
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v6, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
+; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v7, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, 0, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_cndmask_b32 v4, v7, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7
-; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_ne_u64_e64 s0, v[4:5], v[2:3]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: smulo_i64_v_v:
@@ -310,19 +300,18 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo
 ; GFX12-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v5
-; GFX12-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e32 v7, v7, v2, vcc_lo
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc_lo
 ; GFX12-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; GFX12-NEXT:    v_sub_co_u32 v4, vcc_lo, v6, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
+; GFX12-NEXT:    v_sub_co_u32 v4, vcc_lo, v7, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, 0, v5, vcc_lo
 ; GFX12-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_cndmask_b32 v4, v7, v4
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7
-; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
-; GFX12-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
+; GFX12-NEXT:    v_cmp_ne_u64_e64 s0, v[4:5], v[2:3]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
@@ -719,50 +708,45 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
 ; SI-LABEL: smulo_i64_v_4:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_lshl_b64 v[5:6], v[0:1], 2
-; SI-NEXT:    v_alignbit_b32 v4, v1, v0, 30
-; SI-NEXT:    v_ashr_i64 v[2:3], v[5:6], 2
-; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-NEXT:    v_mov_b32_e32 v0, v5
-; SI-NEXT:    v_mov_b32_e32 v1, v4
+; SI-NEXT:    v_lshl_b64 v[2:3], v[0:1], 2
+; SI-NEXT:    v_ashr_i64 v[3:4], v[2:3], 2
+; SI-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[3:4], v[0:1]
+; SI-NEXT:    v_alignbit_b32 v1, v1, v0, 30
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: smulo_i64_v_4:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 2, v[0:1]
+; GFX9-NEXT:    v_ashrrev_i64 v[4:5], 2, v[2:3]
 ; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
-; GFX9-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[5:6], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[4:5], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: smulo_i64_v_4:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 2, v[0:1]
+; GFX10-NEXT:    v_ashrrev_i64 v[4:5], 2, v[2:3]
 ; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
-; GFX10-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
-; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_cmp_ne_u64_e64 s4, v[4:5], v[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: smulo_i64_v_4:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 2, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ashrrev_i64 v[4:5], 2, v[2:3]
 ; GFX11-NEXT:    v_alignbit_b32 v3, v1, v0, 30
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
-; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ne_u64_e64 s0, v[4:5], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: smulo_i64_v_4:
@@ -772,14 +756,13 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshlrev_b64_e32 v[4:5], 2, v[0:1]
+; GFX12-NEXT:    v_lshlrev_b64_e32 v[2:3], 2, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_ashrrev_i64 v[4:5], 2, v[2:3]
 ; GFX12-NEXT:    v_alignbit_b32 v3, v1, v0, 30
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
-; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
-; GFX12-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_ne_u64_e64 s0, v[4:5], v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %i, i64 4)
@@ -790,52 +773,46 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
 ; SI-LABEL: umulo_i64_v_4:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
-; SI-NEXT:    v_mov_b32_e32 v6, v0
-; SI-NEXT:    v_lshl_b64 v[4:5], v[0:1], 2
-; SI-NEXT:    v_alignbit_b32 v3, v1, v0, 30
-; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-NEXT:    v_mov_b32_e32 v0, v4
-; SI-NEXT:    v_mov_b32_e32 v1, v3
+; SI-NEXT:    v_and_b32_e32 v5, 0x3fffffff, v1
+; SI-NEXT:    v_mov_b32_e32 v4, v0
+; SI-NEXT:    v_lshl_b64 v[2:3], v[0:1], 2
+; SI-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[4:5], v[0:1]
+; SI-NEXT:    v_alignbit_b32 v1, v1, v0, 30
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: umulo_i64_v_4:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
-; GFX9-NEXT:    v_mov_b32_e32 v6, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
-; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x3fffffff, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[2:3], v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 2, v[0:1]
+; GFX9-NEXT:    v_alignbit_b32 v1, v1, v0, 30
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: umulo_i64_v_4:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
-; GFX10-NEXT:    v_mov_b32_e32 v6, v0
-; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
+; GFX10-NEXT:    v_and_b32_e32 v5, 0x3fffffff, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 2, v[0:1]
 ; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
-; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_cmp_ne_u64_e64 s4, v[4:5], v[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: umulo_i64_v_4:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v6, v0 :: v_dual_and_b32 v7, 0x3fffffff, v1
-; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v5, 0x3fffffff, v1
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 2, v[0:1]
 ; GFX11-NEXT:    v_alignbit_b32 v3, v1, v0, 30
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ne_u64_e64 s0, v[4:5], v[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: umulo_i64_v_4:
@@ -845,13 +822,12 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v6, v0 :: v_dual_and_b32 v7, 0x3fffffff, v1
-; GFX12-NEXT:    v_lshlrev_b64_e32 v[4:5], 2, v[0:1]
+; GFX12-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v5, 0x3fffffff, v1
+; GFX12-NEXT:    v_lshlrev_b64_e32 v[2:3], 2, v[0:1]
 ; GFX12-NEXT:    v_alignbit_b32 v3, v1, v0, 30
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
-; GFX12-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_ne_u64_e64 s0, v[4:5], v[0:1]
+; GFX12-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %i, i64 4)
diff --git a/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll b/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll
index 546022b4f9c43..de62bac2b5f9c 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll
@@ -19,20 +19,18 @@ define void @loop_on_argument(i1 %arg) {
 ; CHECK-LABEL: loop_on_argument:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    s_mov_b64 s[6:7], 0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:  .LBB0_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_and_b64 s[6:7], exec, vcc
-; CHECK-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; CHECK-NEXT:    s_and_b64 s[8:9], exec, s[4:5]
+; CHECK-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
 ; CHECK-NEXT:    global_store_dword v[0:1], v0, off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
-; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   br label %loop
diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
index a407cd20bf762..67c76578e012c 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
@@ -99,9 +99,7 @@ define void @break_cond_is_arg(i32 %arg, i1 %breakcond) {
 ; GCN-LABEL: break_cond_is_arg:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GCN-NEXT:    s_mov_b32 s10, 1
 ; GCN-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-NEXT:    s_branch .LBB2_2
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
index b8e74bc7db09a..6fe55fbdfbe9a 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
@@ -8,13 +8,9 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
 ; CHECK-LABEL: machinesink_loop_variable_out_of_divergent_loop:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v1, 1, v1
-; CHECK-NEXT:    v_and_b32_e32 v3, 1, v3
-; CHECK-NEXT:    s_mov_b32 s5, 0
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s4, 1, v1
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    s_xor_b32 s6, s4, -1
+; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:    s_inst_prefetch 0x1
 ; CHECK-NEXT:    s_branch .LBB0_3
 ; CHECK-NEXT:    .p2align 6
@@ -25,12 +21,12 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
 ; CHECK-NEXT:  .LBB0_2: ; %Flow1
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s4, 0, v3
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; j lastloop entry
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_or_b32 s5, s4, s5
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_8
 ; CHECK-NEXT:  .LBB0_3: ; %for.body33
 ; CHECK-NEXT:    ; =>This Loop Header: Depth=1
@@ -44,35 +40,34 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
 ; CHECK-NEXT:    s_mov_b32 s8, 0
 ; CHECK-NEXT:    s_mov_b32 s9, 0
 ; CHECK-NEXT:    s_branch .LBB0_6
-; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB0_5: ; %if.end118
 ; CHECK-NEXT:    ; in Loop: Header=BB0_6 Depth=2
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s10
 ; CHECK-NEXT:    s_add_i32 s9, s9, 4
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; backedge
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_add_nc_u32_e32 v4, s9, v2
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s4, v4, v0
-; CHECK-NEXT:    s_or_b32 s8, s4, s8
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, s9, v1
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v4, v0
+; CHECK-NEXT:    s_or_b32 s8, vcc_lo, s8
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_1
 ; CHECK-NEXT:  .LBB0_6: ; %for.body51
 ; CHECK-NEXT:    ; Parent Loop BB0_3 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 1
-; CHECK-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; CHECK-NEXT:    s_and_saveexec_b32 s10, s5
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_5
 ; CHECK-NEXT:  ; %bb.7: ; %if.then112
 ; CHECK-NEXT:    ; in Loop: Header=BB0_6 Depth=2
-; CHECK-NEXT:    s_add_i32 s10, s9, 4
+; CHECK-NEXT:    s_add_i32 s11, s9, 4
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_mov_b32_e32 v4, s10
-; CHECK-NEXT:    ds_write_b32 v1, v4
+; CHECK-NEXT:    v_mov_b32_e32 v4, s11
+; CHECK-NEXT:    ds_write_b32 v2, v4
 ; CHECK-NEXT:    s_branch .LBB0_5
 ; CHECK-NEXT:  .LBB0_8: ; %for.body159.preheader
 ; CHECK-NEXT:    s_inst_prefetch 0x2
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; CHECK-NEXT:    s_mov_b32 vcc_lo, exec_lo
 ; CHECK-NEXT:  .LBB0_9: ; %for.body159
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 1e9994dd8e6ef..85d44460ce3dc 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -54,10 +54,8 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3)
 ; GFX9-LABEL: lsr_order_mul24_1:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v5, 1, v18
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
-; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX9-NEXT:  ; %bb.1: ; %bb19
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, v6
@@ -83,11 +81,11 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3)
 ; GFX9-NEXT:    v_sub_u32_e32 v3, v18, v19
 ; GFX9-NEXT:    v_sub_u32_e32 v12, v12, v19
 ; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[6:7], v20, v15, v[3:4]
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[4:5], v20, v13
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, v20, v13
 ; GFX9-NEXT:    v_cmp_lt_u32_e64 s[6:7], v12, v14
-; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, v18, s[4:5]
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT:    s_and_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v18, vcc
 ; GFX9-NEXT:    v_lshlrev_b64 v[18:19], 2, v[3:4]
 ; GFX9-NEXT:    v_add_co_u32_e64 v18, s[6:7], v10, v18
 ; GFX9-NEXT:    v_addc_co_u32_e64 v19, s[6:7], v11, v19, s[6:7]
@@ -95,7 +93,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3)
 ; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v0, v1
 ; GFX9-NEXT:    s_or_b64 s[10:11], s[6:7], s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
 ; GFX9-NEXT:    ds_write_b32 v6, v3
 ; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[10:11]
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
index 13f8eff94f86b..f6f3128a1dec9 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
@@ -17,14 +17,10 @@ define void @nested_inf_loop(i1 %0, i1 %1) {
 ; ISA-LABEL: nested_inf_loop:
 ; ISA-NEXT: %bb.0:                                ; %BB
 ; ISA-NEXT: 	s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; ISA-NEXT: 	v_and_b32_e32 v1, 1, v1
-; ISA-NEXT: 	v_and_b32_e32 v0, 1, v0
-; ISA-NEXT: 	v_cmp_eq_u32_e64 s[4:5], 1, v1
-; ISA-NEXT: 	v_cmp_eq_u32_e32 vcc, 1, v0
-; ISA-NEXT: 	s_xor_b64 s[6:7], vcc, -1
+; ISA-NEXT: 	s_xor_b64 s[4:5], s[4:5], -1
 ; ISA-NEXT: 	s_mov_b64 s[8:9], 0
 ; ISA-NEXT: .LBB0_1:                                ; %BB1
-; ISA: 	      s_and_b64 s[10:11], exec, s[6:7]
+; ISA: 	      s_and_b64 s[10:11], exec, s[4:5]
 ; ISA-NEXT: 	s_or_b64 s[8:9], s[10:11], s[8:9]
 ; ISA-NEXT: 	s_andn2_b64 exec, exec, s[8:9]
 ; ISA-NEXT: 	s_cbranch_execnz .LBB0_1
@@ -32,7 +28,7 @@ define void @nested_inf_loop(i1 %0, i1 %1) {
 ; ISA: 	      s_or_b64 exec, exec, s[8:9]
 ; ISA-NEXT: 	s_mov_b64 s[8:9], 0
 ; ISA-NEXT: .LBB0_3:                                ; %BB4
-; ISA: 	      s_and_b64 s[10:11], exec, s[4:5]
+; ISA: 	      s_and_b64 s[10:11], exec, s[6:7]
 ; ISA-NEXT: 	s_or_b64 s[8:9], s[10:11], s[8:9]
 ; ISA-NEXT: 	s_andn2_b64 exec, exec, s[8:9]
 ; ISA-NEXT: 	s_cbranch_execnz .LBB0_3
diff --git a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
index d34769ad0fcf0..e77e8dd2a3821 100644
--- a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
@@ -10,12 +10,10 @@ define void @__omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_ev
 ; GCN-NEXT:    .cfi_startproc
 ; GCN-NEXT:  ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GCN-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
 ; GCN-NEXT:    s_cbranch_execnz .LBB0_3
@@ -27,18 +25,18 @@ define void @__omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_ev
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ; GCN-NEXT:  .LBB0_3: ; %bb2
-; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    v_mov_b32_e32 v4, v3
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
-; GCN-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GCN-NEXT:    s_cbranch_execz .LBB0_2
 ; GCN-NEXT:  .LBB0_4: ; %bb1
-; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    v_mov_b32_e32 v4, v3
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
+; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
index 126b17e718b59..8e22aa65c68b9 100644
--- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
@@ -6,12 +6,11 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s4, 0x1f5a814b
-; CHECK-NEXT:    s_mov_b32 s5, 0x52bf5b
 ; CHECK-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, 0x295fad, v0
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x1fffffff, v0
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s5, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    s_mov_b32 s4, 0x52bf5b
+; CHECK-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %srem = srem i29 %X, 99
   %cmp = icmp eq i29 %srem, 0
@@ -31,8 +30,7 @@ define i1 @test_srem_even(i4 %X) nounwind {
 ; CHECK-NEXT:    v_mul_u32_u24_e32 v1, 6, v1
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; CHECK-NEXT:    v_and_b32_e32 v0, 15, v0
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %srem = srem i4 %X, 6
   %cmp = icmp eq i4 %srem, 1
@@ -49,8 +47,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
 ; CHECK-NEXT:    v_and_b32_e32 v1, 60, v1
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; CHECK-NEXT:    v_and_b32_e32 v0, 63, v0
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %srem = srem i6 %X, 4
   %cmp = icmp ne i6 %srem, 0
diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
index c6a599094fe43..f32ca994589d4 100644
--- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
+++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
@@ -203,33 +203,29 @@ define void @func_stacksave_nonentry_block(i1 %cond) {
 ; WAVE32-OPT-LABEL: func_stacksave_nonentry_block:
 ; WAVE32-OPT:       ; %bb.0: ; %bb0
 ; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; WAVE32-OPT-NEXT:    v_and_b32_e32 v0, 1, v0
-; WAVE32-OPT-NEXT:    s_mov_b32 s4, exec_lo
-; WAVE32-OPT-NEXT:    v_cmpx_eq_u32_e32 1, v0
+; WAVE32-OPT-NEXT:    s_and_saveexec_b32 s5, s4
 ; WAVE32-OPT-NEXT:    s_cbranch_execz .LBB4_2
 ; WAVE32-OPT-NEXT:  ; %bb.1: ; %bb1
-; WAVE32-OPT-NEXT:    s_lshr_b32 s5, s32, 5
+; WAVE32-OPT-NEXT:    s_lshr_b32 s4, s32, 5
 ; WAVE32-OPT-NEXT:    ;;#ASMSTART
-; WAVE32-OPT-NEXT:    ; use s5
+; WAVE32-OPT-NEXT:    ; use s4
 ; WAVE32-OPT-NEXT:    ;;#ASMEND
 ; WAVE32-OPT-NEXT:  .LBB4_2: ; %bb2
-; WAVE32-OPT-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; WAVE32-OPT-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; WAVE32-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-OPT-LABEL: func_stacksave_nonentry_block:
 ; WAVE64-OPT:       ; %bb.0: ; %bb0
 ; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; WAVE64-OPT-NEXT:    v_and_b32_e32 v0, 1, v0
-; WAVE64-OPT-NEXT:    s_mov_b64 s[4:5], exec
-; WAVE64-OPT-NEXT:    v_cmpx_eq_u32_e32 1, v0
+; WAVE64-OPT-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; WAVE64-OPT-NEXT:    s_cbranch_execz .LBB4_2
 ; WAVE64-OPT-NEXT:  ; %bb.1: ; %bb1
-; WAVE64-OPT-NEXT:    s_lshr_b32 s6, s32, 6
+; WAVE64-OPT-NEXT:    s_lshr_b32 s4, s32, 6
 ; WAVE64-OPT-NEXT:    ;;#ASMSTART
-; WAVE64-OPT-NEXT:    ; use s6
+; WAVE64-OPT-NEXT:    ; use s4
 ; WAVE64-OPT-NEXT:    ;;#ASMEND
 ; WAVE64-OPT-NEXT:  .LBB4_2: ; %bb2
-; WAVE64-OPT-NEXT:    s_or_b64 exec, exec, s[4:5]
+; WAVE64-OPT-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; WAVE64-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-O0-LABEL: func_stacksave_nonentry_block:
@@ -238,20 +234,13 @@ define void @func_stacksave_nonentry_block(i1 %cond) {
 ; WAVE32-O0-NEXT:    s_xor_saveexec_b32 s4, -1
 ; WAVE32-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; WAVE32-O0-NEXT:    s_mov_b32 exec_lo, s4
-; WAVE32-O0-NEXT:    ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
-; WAVE32-O0-NEXT:    v_mov_b32_e32 v1, v0
-; WAVE32-O0-NEXT:    s_or_saveexec_b32 s7, -1
-; WAVE32-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; WAVE32-O0-NEXT:    s_mov_b32 exec_lo, s7
-; WAVE32-O0-NEXT:    v_and_b32_e64 v1, 1, v1
-; WAVE32-O0-NEXT:    v_cmp_eq_u32_e64 s5, v1, 1
+; WAVE32-O0-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
 ; WAVE32-O0-NEXT:    s_mov_b32 s4, exec_lo
-; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0)
 ; WAVE32-O0-NEXT:    v_writelane_b32 v0, s4, 0
 ; WAVE32-O0-NEXT:    s_or_saveexec_b32 s7, -1
 ; WAVE32-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; WAVE32-O0-NEXT:    s_mov_b32 exec_lo, s7
-; WAVE32-O0-NEXT:    s_and_b32 s4, s4, s5
+; WAVE32-O0-NEXT:    s_and_b32 s4, s4, s6
 ; WAVE32-O0-NEXT:    s_mov_b32 exec_lo, s4
 ; WAVE32-O0-NEXT:    s_cbranch_execz .LBB4_2
 ; WAVE32-O0-NEXT:  ; %bb.1: ; %bb1
@@ -280,15 +269,8 @@ define void @func_stacksave_nonentry_block(i1 %cond) {
 ; WAVE64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; WAVE64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; WAVE64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; WAVE64-O0-NEXT:    ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
-; WAVE64-O0-NEXT:    v_mov_b32_e32 v1, v0
-; WAVE64-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; WAVE64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; WAVE64-O0-NEXT:    s_mov_b64 exec, s[10:11]
-; WAVE64-O0-NEXT:    v_and_b32_e64 v1, 1, v1
-; WAVE64-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v1, 1
+; WAVE64-O0-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
 ; WAVE64-O0-NEXT:    s_mov_b64 s[4:5], exec
-; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; WAVE64-O0-NEXT:    v_writelane_b32 v0, s4, 0
 ; WAVE64-O0-NEXT:    v_writelane_b32 v0, s5, 1
 ; WAVE64-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
@@ -322,14 +304,12 @@ define void @func_stacksave_nonentry_block(i1 %cond) {
 ; WAVE32-WWM-PREALLOC:       ; %bb.0: ; %bb0
 ; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_xor_saveexec_b32 s4, -1
-; WAVE32-WWM-PREALLOC-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; WAVE32-WWM-PREALLOC-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 exec_lo, s4
-; WAVE32-WWM-PREALLOC-NEXT:    ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
-; WAVE32-WWM-PREALLOC-NEXT:    v_and_b32_e64 v0, 1, v0
-; WAVE32-WWM-PREALLOC-NEXT:    v_cmp_eq_u32_e64 s5, v0, 1
+; WAVE32-WWM-PREALLOC-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s4, exec_lo
-; WAVE32-WWM-PREALLOC-NEXT:    v_writelane_b32 v1, s4, 0
-; WAVE32-WWM-PREALLOC-NEXT:    s_and_b32 s4, s4, s5
+; WAVE32-WWM-PREALLOC-NEXT:    v_writelane_b32 v0, s4, 0
+; WAVE32-WWM-PREALLOC-NEXT:    s_and_b32 s4, s4, s6
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 exec_lo, s4
 ; WAVE32-WWM-PREALLOC-NEXT:    s_cbranch_execz .LBB4_2
 ; WAVE32-WWM-PREALLOC-NEXT:  ; %bb.1: ; %bb1
@@ -339,11 +319,11 @@ define void @func_stacksave_nonentry_block(i1 %cond) {
 ; WAVE32-WWM-PREALLOC-NEXT:    ; use s4
 ; WAVE32-WWM-PREALLOC-NEXT:    ;;#ASMEND
 ; WAVE32-WWM-PREALLOC-NEXT:  .LBB4_2: ; %bb2
-; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s4, v1, 0
+; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s4, v0, 0
 ; WAVE32-WWM-PREALLOC-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; WAVE32-WWM-PREALLOC-NEXT:    ; kill: killed $vgpr1
+; WAVE32-WWM-PREALLOC-NEXT:    ; kill: killed $vgpr0
 ; WAVE32-WWM-PREALLOC-NEXT:    s_xor_saveexec_b32 s4, -1
-; WAVE32-WWM-PREALLOC-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; WAVE32-WWM-PREALLOC-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 exec_lo, s4
 ; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
index a0dd0e7e78f9d..0de7658dc39c4 100644
--- a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
@@ -6,11 +6,10 @@ define i1 @test_urem_odd(i13 %X) nounwind {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x1fff, v0
-; CHECK-NEXT:    s_movk_i32 s4, 0x667
 ; CHECK-NEXT:    v_mul_u32_u24_e32 v0, 0xccd, v0
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x1fff, v0
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    s_movk_i32 s4, 0x667
+; CHECK-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %urem = urem i13 %X, 5
   %cmp = icmp eq i13 %urem, 0
@@ -22,14 +21,13 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s4, 0x6db6db7
-; CHECK-NEXT:    s_mov_b32 s5, 0x924925
 ; CHECK-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 26, v0
 ; CHECK-NEXT:    v_bfe_u32 v0, v0, 1, 26
 ; CHECK-NEXT:    v_or_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x7ffffff, v0
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s5, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    s_mov_b32 s4, 0x924925
+; CHECK-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %urem = urem i27 %X, 14
   %cmp = icmp eq i27 %urem, 0
@@ -43,8 +41,7 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
 ; CHECK-NEXT:    v_and_b32_e32 v0, 15, v0
 ; CHECK-NEXT:    v_mul_u32_u24_e32 v0, 13, v0
 ; CHECK-NEXT:    v_and_b32_e32 v0, 15, v0
-; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, 3, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    v_cmp_lt_u32_e64 s[4:5], 3, v0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %urem = urem i4 %X, 5
   %cmp = icmp ne i4 %urem, 0
@@ -58,8 +55,7 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x1ff, v0
 ; CHECK-NEXT:    v_mul_u32_u24_e32 v0, 0x133, v0
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x1ff, v0
-; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, 1, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    v_cmp_lt_u32_e64 s[4:5], 1, v0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %urem = urem i9 %X, -5
   %cmp = icmp ne i9 %urem, 0

>From c0dfff71a591278bb15d328e6cbd804229c7d8ab Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 13 May 2024 16:41:49 -0500
Subject: [PATCH 19/25] Testcase updates.

---
 .../AMDGPU/GlobalISel/function-i1-args.ll     | 142 -----
 llvm/test/CodeGen/AMDGPU/function-i1-args.ll  | 514 +++++++++++++++++-
 2 files changed, 511 insertions(+), 145 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
index 47c4682196d60..8fdd512a1c61a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
@@ -706,41 +706,6 @@ define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
   ret void
 }
 
-define void @test_call_void_func_i1_i1_inreg() {
-; GFX9-LABEL: name: test_call_void_func_i1_i1_inreg
-; GFX9: bb.1 (%ir-block.0):
-; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
-; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
-; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
-; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_i1_inreg]]
-; GFX9-NEXT:    $sgpr4_sgpr5 = COPY [[LOAD]](s1)
-; GFX9-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
-; GFX9-NEXT:    $sgpr6 = COPY [[ANYEXT]](s32)
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX9-NEXT:    SI_RETURN
-;
-; GFX11-LABEL: name: test_call_void_func_i1_i1_inreg
-; GFX11: bb.1 (%ir-block.0):
-; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
-; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
-; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
-; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_i1_inreg]]
-; GFX11-NEXT:    $sgpr0 = COPY [[LOAD]](s1)
-; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
-; GFX11-NEXT:    $sgpr1 = COPY [[ANYEXT]](s32)
-; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0, implicit $sgpr1
-; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX11-NEXT:    SI_RETURN
-  %val = load i1, ptr addrspace(1) undef
-  call void @void_func_i1_i1_inreg(i1 %val, i1 inreg true)
-  ret void
-}
-
 define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
 ; GFX9-LABEL: name: void_func_i1_inreg_i1
 ; GFX9: bb.1 (%ir-block.0):
@@ -770,42 +735,6 @@ define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
   ret void
 }
 
-define void @test_call_void_func_i1_inreg_i1() {
-; GFX9-LABEL: name: test_call_void_func_i1_inreg_i1
-; GFX9: bb.1 (%ir-block.0):
-; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
-; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
-; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
-; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_inreg_i1]]
-; GFX9-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX9-NEXT:    $sgpr4 = COPY [[ANYEXT]](s32)
-; GFX9-NEXT:    $sgpr6_sgpr7 = COPY [[CONST]](s1)
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr4, implicit $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX9-NEXT:    SI_RETURN
-;
-; GFX11-LABEL: name: test_call_void_func_i1_inreg_i1
-; GFX11: bb.1 (%ir-block.0):
-; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
-; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
-; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
-; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_inreg_i1]]
-; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT:    $sgpr0 = COPY [[ANYEXT]](s32)
-; GFX11-NEXT:    $sgpr1 = COPY [[CONST]](s1)
-; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0, implicit $sgpr1
-; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX11-NEXT:    SI_RETURN
-
-  %val = load i1, ptr addrspace(1) undef
-  call void @void_func_i1_inreg_i1(i1 inreg %val, i1 true)
-  ret void
-}
-
 define void @void_func_zeroext_i1_i1_inreg(i1 zeroext %arg0, i1 inreg %arg1) {
 ; GFX9-LABEL: name: void_func_zeroext_i1_i1_inreg
 ; GFX9: bb.1 (%ir-block.0):
@@ -835,41 +764,6 @@ define void @void_func_zeroext_i1_i1_inreg(i1 zeroext %arg0, i1 inreg %arg1) {
   ret void
 }
 
-define void @test_call_void_func_zeroext_i1_i1_inreg() {
-; GFX9-LABEL: name: test_call_void_func_zeroext_i1_i1_inreg
-; GFX9: bb.1 (%ir-block.0):
-; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
-; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
-; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
-; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_zeroext_i1_i1_inreg]]
-; GFX9-NEXT:    $sgpr4_sgpr5 = COPY [[LOAD]](s1)
-; GFX9-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
-; GFX9-NEXT:    $sgpr6 = COPY [[ANYEXT]](s32)
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX9-NEXT:    SI_RETURN
-;
-; GFX11-LABEL: name: test_call_void_func_zeroext_i1_i1_inreg
-; GFX11: bb.1 (%ir-block.0):
-; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
-; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
-; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
-; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_zeroext_i1_i1_inreg]]
-; GFX11-NEXT:    $sgpr0 = COPY [[LOAD]](s1)
-; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
-; GFX11-NEXT:    $sgpr1 = COPY [[ANYEXT]](s32)
-; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0, implicit $sgpr1
-; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX11-NEXT:    SI_RETURN
-  %val = load i1, ptr addrspace(1) undef
-  call void @void_func_zeroext_i1_i1_inreg(i1 zeroext %val, i1 inreg true)
-  ret void
-}
-
 define void @void_func_i1_inreg_zeroext_i1(i1 inreg %arg0, i1 zeroext %arg1) {
 ; GFX9-LABEL: name: void_func_i1_inreg_zeroext_i1
 ; GFX9: bb.1 (%ir-block.0):
@@ -899,42 +793,6 @@ define void @void_func_i1_inreg_zeroext_i1(i1 inreg %arg0, i1 zeroext %arg1) {
   ret void
 }
 
-define void @test_call_void_func_i1_inreg_zeroext_i1() {
-; GFX9-LABEL: name: test_call_void_func_i1_inreg_zeroext_i1
-; GFX9: bb.1 (%ir-block.0):
-; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
-; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
-; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
-; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_inreg_zeroext_i1
-; GFX9-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX9-NEXT:    $sgpr4 = COPY [[ANYEXT]](s32)
-; GFX9-NEXT:    $sgpr6_sgpr7 = COPY [[CONST]](s1)
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_inreg_zeroext_i1, csr_amdgpu, implicit $sgpr4, implicit $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX9-NEXT:    SI_RETURN
-;
-; GFX11-LABEL: name: test_call_void_func_i1_inreg_zeroext_i1
-; GFX11: bb.1 (%ir-block.0):
-; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
-; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true  
-; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
-; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_inreg_zeroext_i1]]
-; GFX11-NEXT:    [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT:    $sgpr0 = COPY [[ANYEXT]](s32)
-; GFX11-NEXT:    $sgpr1 = COPY [[CONST]](s1)
-; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0, implicit $sgpr1
-; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX11-NEXT:    SI_RETURN
-
-  %val = load i1, ptr addrspace(1) undef
-  call void @void_func_i1_inreg_zeroext_i1(i1 inreg %val, i1 zeroext true)
-  ret void
-}
-
 define void @void_func_signext_i1_i1_inreg(i1 signext %arg0, i1 inreg %arg1) {
 ; GFX9-LABEL: name: void_func_signext_i1_i1_inreg
 ; GFX9: bb.1 (%ir-block.0):
diff --git a/llvm/test/CodeGen/AMDGPU/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
index 2d63695674404..caf0879671d85 100644
--- a/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
@@ -733,8 +733,8 @@ define void @void_func_a2i1_i1([2 x i1] %arg0, i1 %arg1) {
   ret void
 }
 
-define void @many_i1_args(
-; GFX9-LABEL: many_i1_args:
+define void @exhaust_sgprs_by_i1_args(
+; GFX9-LABEL: exhaust_sgprs_by_i1_args:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
@@ -835,7 +835,7 @@ define void @many_i1_args(
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: many_i1_args:
+; GFX11-LABEL: exhaust_sgprs_by_i1_args:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
@@ -978,6 +978,327 @@ define void @many_i1_args(
   ret void
 }
 
+define void @void_func_a64i1([64 x i1] %arg0) {
+; GFX9-LABEL: void_func_a64i1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    v_and_b32_e32 v30, 1, v30
+; GFX9-NEXT:    v_and_b32_e32 v29, 1, v29
+; GFX9-NEXT:    v_and_b32_e32 v28, 1, v28
+; GFX9-NEXT:    v_and_b32_e32 v27, 1, v27
+; GFX9-NEXT:    v_and_b32_e32 v26, 1, v26
+; GFX9-NEXT:    v_and_b32_e32 v25, 1, v25
+; GFX9-NEXT:    v_and_b32_e32 v24, 1, v24
+; GFX9-NEXT:    v_and_b32_e32 v23, 1, v23
+; GFX9-NEXT:    v_and_b32_e32 v22, 1, v22
+; GFX9-NEXT:    v_and_b32_e32 v21, 1, v21
+; GFX9-NEXT:    v_and_b32_e32 v20, 1, v20
+; GFX9-NEXT:    v_and_b32_e32 v19, 1, v19
+; GFX9-NEXT:    v_and_b32_e32 v18, 1, v18
+; GFX9-NEXT:    v_and_b32_e32 v17, 1, v17
+; GFX9-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX9-NEXT:    v_and_b32_e32 v15, 1, v15
+; GFX9-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX9-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX9-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX9-NEXT:    v_and_b32_e32 v11, 1, v11
+; GFX9-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX9-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX9-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX9-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX9-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX9-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    global_store_byte v[0:1], v30, off
+; GFX9-NEXT:    global_store_byte v[0:1], v29, off
+; GFX9-NEXT:    global_store_byte v[0:1], v28, off
+; GFX9-NEXT:    global_store_byte v[0:1], v27, off
+; GFX9-NEXT:    global_store_byte v[0:1], v26, off
+; GFX9-NEXT:    global_store_byte v[0:1], v25, off
+; GFX9-NEXT:    global_store_byte v[0:1], v24, off
+; GFX9-NEXT:    global_store_byte v[0:1], v23, off
+; GFX9-NEXT:    global_store_byte v[0:1], v22, off
+; GFX9-NEXT:    global_store_byte v[0:1], v21, off
+; GFX9-NEXT:    global_store_byte v[0:1], v20, off
+; GFX9-NEXT:    global_store_byte v[0:1], v19, off
+; GFX9-NEXT:    global_store_byte v[0:1], v18, off
+; GFX9-NEXT:    global_store_byte v[0:1], v17, off
+; GFX9-NEXT:    global_store_byte v[0:1], v16, off
+; GFX9-NEXT:    global_store_byte v[0:1], v15, off
+; GFX9-NEXT:    global_store_byte v[0:1], v14, off
+; GFX9-NEXT:    global_store_byte v[0:1], v13, off
+; GFX9-NEXT:    global_store_byte v[0:1], v12, off
+; GFX9-NEXT:    global_store_byte v[0:1], v11, off
+; GFX9-NEXT:    global_store_byte v[0:1], v10, off
+; GFX9-NEXT:    global_store_byte v[0:1], v9, off
+; GFX9-NEXT:    global_store_byte v[0:1], v8, off
+; GFX9-NEXT:    global_store_byte v[0:1], v7, off
+; GFX9-NEXT:    global_store_byte v[0:1], v6, off
+; GFX9-NEXT:    global_store_byte v[0:1], v5, off
+; GFX9-NEXT:    global_store_byte v[0:1], v4, off
+; GFX9-NEXT:    global_store_byte v[0:1], v3, off
+; GFX9-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-NEXT:    s_waitcnt vmcnt(29)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:64
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:56
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    buffer_load_ubyte v31, off, s[0:3], s32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT:    global_store_byte v[0:1], v31, off
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    global_store_byte v[0:1], v1, off
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[28:29]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[26:27]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[24:25]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[22:23]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[20:21]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[18:19]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[16:17]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[14:15]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[12:13]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[10:11]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[8:9]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[6:7]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_a64i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    scratch_load_u8 v32, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u8 v33, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_u8 v34, off, s32
+; GFX11-NEXT:    v_cndmask_b32_e64 v35, 0, 1, s29
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v36, 0, 1, s28
+; GFX11-NEXT:    v_cndmask_b32_e64 v37, 0, 1, s27
+; GFX11-NEXT:    v_cndmask_b32_e64 v38, 0, 1, s26
+; GFX11-NEXT:    v_cndmask_b32_e64 v39, 0, 1, s25
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v48, 0, 1, s24
+; GFX11-NEXT:    v_cndmask_b32_e64 v49, 0, 1, s23
+; GFX11-NEXT:    v_cndmask_b32_e64 v50, 0, 1, s22
+; GFX11-NEXT:    v_cndmask_b32_e64 v51, 0, 1, s21
+; GFX11-NEXT:    v_cndmask_b32_e64 v52, 0, 1, s20
+; GFX11-NEXT:    v_cndmask_b32_e64 v53, 0, 1, s19
+; GFX11-NEXT:    v_cndmask_b32_e64 v54, 0, 1, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v55, 0, 1, s17
+; GFX11-NEXT:    v_cndmask_b32_e64 v64, 0, 1, s16
+; GFX11-NEXT:    v_cndmask_b32_e64 v65, 0, 1, s15
+; GFX11-NEXT:    v_cndmask_b32_e64 v66, 0, 1, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v67, 0, 1, s13
+; GFX11-NEXT:    v_cndmask_b32_e64 v68, 0, 1, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v69, 0, 1, s11
+; GFX11-NEXT:    v_cndmask_b32_e64 v70, 0, 1, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v71, 0, 1, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v80, 0, 1, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v81, 0, 1, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v82, 0, 1, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v83, 0, 1, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v84, 0, 1, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v85, 0, 1, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v86, 0, 1, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v87, 0, 1, s1
+; GFX11-NEXT:    v_and_b32_e32 v30, 1, v30
+; GFX11-NEXT:    v_and_b32_e32 v29, 1, v29
+; GFX11-NEXT:    v_and_b32_e32 v28, 1, v28
+; GFX11-NEXT:    v_and_b32_e32 v27, 1, v27
+; GFX11-NEXT:    v_and_b32_e32 v26, 1, v26
+; GFX11-NEXT:    v_and_b32_e32 v25, 1, v25
+; GFX11-NEXT:    v_and_b32_e32 v24, 1, v24
+; GFX11-NEXT:    v_and_b32_e32 v23, 1, v23
+; GFX11-NEXT:    v_and_b32_e32 v22, 1, v22
+; GFX11-NEXT:    v_and_b32_e32 v21, 1, v21
+; GFX11-NEXT:    v_and_b32_e32 v20, 1, v20
+; GFX11-NEXT:    v_and_b32_e32 v19, 1, v19
+; GFX11-NEXT:    v_and_b32_e32 v18, 1, v18
+; GFX11-NEXT:    v_and_b32_e32 v17, 1, v17
+; GFX11-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX11-NEXT:    v_and_b32_e32 v15, 1, v15
+; GFX11-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX11-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX11-NEXT:    v_and_b32_e32 v11, 1, v11
+; GFX11-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX11-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX11-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX11-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    global_store_b8 v[0:1], v35, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v36, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v37, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v38, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v39, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v48, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v49, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v50, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v51, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v52, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v53, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v54, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v55, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v64, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v65, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v66, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v67, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v68, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v69, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v70, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v71, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v80, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v81, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v82, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v83, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v84, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v85, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v86, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v87, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v30, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v29, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v28, off
+; GFX11-NEXT:    s_clause 0x19
+; GFX11-NEXT:    global_store_b8 v[0:1], v27, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v26, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v25, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v24, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v23, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v22, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v21, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v20, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v19, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v18, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v17, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v16, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v15, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v14, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v13, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v12, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v11, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v10, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v9, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v8, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v7, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v6, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v5, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v3, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
+; GFX11-NEXT:    v_cndmask_b32_e64 v31, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v32
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v3, 1, v33
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v4, 1, v34
+; GFX11-NEXT:    s_clause 0x5
+; GFX11-NEXT:    global_store_b8 v[0:1], v1, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v3, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v31, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store [64 x i1] %arg0, ptr addrspace(1) undef
+  ret void
+}
+
 define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
 ; GFX9-LABEL: void_func_i1_i1_inreg:
 ; GFX9:       ; %bb.0:
@@ -1008,6 +1329,76 @@ define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
   ret void
 }
 
+define void @test_call_void_func_i1_i1_inreg() {
+; GFX9-LABEL: test_call_void_func_i1_i1_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s7, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, void_func_i1_i1_inreg at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, void_func_i1_i1_inreg at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    s_mov_b32 s6, 1
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s7
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_void_func_i1_i1_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s4, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, void_func_i1_i1_inreg at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, void_func_i1_i1_inreg at gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s1, 1
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load i1, ptr addrspace(1) undef
+  call void @void_func_i1_i1_inreg(i1 %val, i1 inreg true)
+  ret void
+}
+
 define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
 ; GFX9-LABEL: void_func_i1_inreg_i1:
 ; GFX9:       ; %bb.0:
@@ -1037,3 +1428,120 @@ define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
   ret void
 }
 
+define void @void_func_zeroext_i1_i1_inreg(i1 zeroext %arg0, i1 inreg %arg1) {
+; GFX9-LABEL: void_func_zeroext_i1_i1_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:    s_and_b32 s4, s6, 1
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_zeroext_i1_i1_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_and_b32 s0, s1, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_i1_inreg_zeroext_i1(i1 inreg %arg0, i1 zeroext %arg1) {
+; GFX9-LABEL: void_func_i1_inreg_zeroext_i1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    global_store_byte v[0:1], v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_inreg_zeroext_i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s0, s0, 1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_signext_i1_i1_inreg(i1 signext %arg0, i1 inreg %arg1) {
+; GFX9-LABEL: void_func_signext_i1_i1_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:    s_and_b32 s4, s6, 1
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_signext_i1_i1_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_and_b32 s0, s1, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_i1_inreg_signext_i1(i1 inreg %arg0, i1 signext %arg1) {
+; GFX9-LABEL: void_func_i1_inreg_signext_i1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    global_store_byte v[0:1], v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_inreg_signext_i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s0, s0, 1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  ret void
+}

>From 6f2289b8a7e896765d5fa14dda3c87052f9c8a5a Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 13 May 2024 17:44:52 -0500
Subject: [PATCH 20/25] Fix test file after merge from main.

---
 .../CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index 5d2f794b94c4d..f0ab1b25d6f03 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -101,8 +101,8 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY]], [[C]]
-  ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:%[0-9]+]]:_(s1), [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS1:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[XOR]](s1)
-  ; CHECK-NEXT:   G_BRCOND [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS]](s1), %bb.2
+  ; CHECK-NEXT:   [[INTRINSIC_W_SIDE_EFFECTS:%[0-9]+]]:_(s1), [[INTRINSIC_W_SIDE_EFFECTS1:%[0-9]+]]:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[XOR]](s1)
+  ; CHECK-NEXT:   G_BRCOND [[INTRINSIC_W_SIDE_EFFECTS]](s1), %bb.2
   ; CHECK-NEXT:   G_BR %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.bb1:

>From 265d5c66da7047cebf436daff62eeb22969f5d78 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Tue, 14 May 2024 17:18:27 -0500
Subject: [PATCH 21/25] (1) Fix a problem with reserving ScratchRSrcD (2)
 update test files.

---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |   30 +-
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h   |   12 +-
 .../GlobalISel/function-call-i1-return.ll     |  304 ++++-
 .../AMDGPU/GlobalISel/function-i1-args.ll     |   81 ++
 llvm/test/CodeGen/AMDGPU/allow-check.ll       |   18 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll              |   16 +-
 .../CodeGen/AMDGPU/function-call-i1-return.ll |  505 ++++++++
 llvm/test/CodeGen/AMDGPU/function-i1-args.ll  |    1 -
 .../AMDGPU/global_atomics_scan_fadd.ll        | 1048 ++++++++---------
 .../AMDGPU/global_atomics_scan_fmax.ll        |  708 ++++++-----
 .../AMDGPU/global_atomics_scan_fmin.ll        |  708 ++++++-----
 .../AMDGPU/global_atomics_scan_fsub.ll        | 1048 ++++++++---------
 .../AMDGPU/lds-global-non-entry-func.ll       |    6 -
 13 files changed, 2614 insertions(+), 1871 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index c69cf8c34a6b2..9217370193156 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -317,8 +317,12 @@ bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
   return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv, IsVarArg));
 }
 
-/// Special handling for i1 return val: based on determineAndHandleAssignments()
-bool AMDGPUCallLowering::determineAndHandleAssignmentsForI1Return(
+/// Replace CallLowering::determineAndHandleAssignments() because we need to
+/// reserve ScratchRSrcReg when necessary.
+/// TODO: Investigate if reserving ScratchRSrcReg can be moved to calling conv
+/// functions. If so, then this function is not needed anymore -- we can just
+/// use CallLowering::determineAndHandleAssignments() as before.
+bool AMDGPUCallLowering::determineAndHandleAssignmentsLocal(
     ValueHandler &Handler, ValueAssigner &Assigner,
     SmallVectorImpl<ArgInfo> &Args, MachineIRBuilder &MIRBuilder,
     CallingConv::ID CallConv, bool IsVarArg) const {
@@ -405,12 +409,8 @@ bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
   OutgoingValueAssigner Assigner(AssignFn);
   AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
 
-  if (SplitEVTs.size() == 1 && SplitEVTs[0] == MVT::i1)
-    return determineAndHandleAssignmentsForI1Return(
-        RetHandler, Assigner, SplitRetInfos, B, CC, F.isVarArg());
-  else
-    return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
-                                         CC, F.isVarArg());
+  return determineAndHandleAssignmentsLocal(RetHandler, Assigner, SplitRetInfos,
+                                            B, CC, F.isVarArg());
 }
 
 bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
@@ -1575,16 +1575,10 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                                       Info.IsVarArg);
     IncomingValueAssigner Assigner(RetAssignFn);
     CallReturnHandler Handler(MIRBuilder, MRI, MIB);
-    if (Info.OrigRet.Ty->isIntegerTy(1)) {
-      if (!determineAndHandleAssignmentsForI1Return(Handler, Assigner, InArgs,
-                                                    MIRBuilder, Info.CallConv,
-                                                    Info.IsVarArg))
-        return false;
-    } else {
-      if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
-                                         Info.CallConv, Info.IsVarArg))
-        return false;
-    }
+    if (!determineAndHandleAssignmentsLocal(Handler, Assigner, InArgs,
+                                            MIRBuilder, Info.CallConv,
+                                            Info.IsVarArg))
+      return false;
   }
 
   uint64_t CalleePopBytes = NumBytes;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index afe3a7a19601a..f9b8599e8ce8c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -37,12 +37,12 @@ class AMDGPUCallLowering final : public CallLowering {
   bool lowerReturnVal(MachineIRBuilder &B, const Value *Val,
                       ArrayRef<Register> VRegs, MachineInstrBuilder &Ret) const;
 
-  bool determineAndHandleAssignmentsForI1Return(ValueHandler &Handler,
-                                                ValueAssigner &Assigner,
-                                                SmallVectorImpl<ArgInfo> &Args,
-                                                MachineIRBuilder &MIRBuilder,
-                                                CallingConv::ID CallConv,
-                                                bool IsVarArg) const;
+  bool determineAndHandleAssignmentsLocal(ValueHandler &Handler,
+                                          ValueAssigner &Assigner,
+                                          SmallVectorImpl<ArgInfo> &Args,
+                                          MachineIRBuilder &MIRBuilder,
+                                          CallingConv::ID CallConv,
+                                          bool IsVarArg) const;
 
 public:
   AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
index a022c13f38f9a..679eb28d4a04c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
@@ -193,6 +193,112 @@ define void @test_call_inreg_i1_func_void() {
   ret void
 }
 
+define signext inreg i1 @signext_inreg_i1_func_void() {
+; GFX9-LABEL: name: signext_inreg_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    [[EXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s1)
+; GFX9-NEXT:    $vgpr0 = COPY [[EXT]](s32)
+; GFX9-NEXT:    SI_RETURN implicit $vgpr0
+;
+; GFX11-LABEL: name: signext_inreg_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    [[EXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s1)
+; GFX11-NEXT:    $vgpr0 = COPY [[EXT]](s32)
+; GFX11-NEXT:    SI_RETURN implicit $vgpr0
+  %val = load i1, ptr addrspace(1) undef
+  ret i1 %val
+}
+
+define void @test_call_signext_inreg_i1_func_void() {
+; GFX9-LABEL: name: test_call_signext_inreg_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:signext_inreg_i1_func_void]]
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX9-NEXT:    [[ASSERTEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY2]], 1
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERTEXT]](s32)
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_signext_inreg_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:signext_inreg_i1_func_void]]
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit-def $vgpr0
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT:    [[ASSERTEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY]], 1
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERTEXT]](s32)
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  %val = call i1 @signext_inreg_i1_func_void()
+  store volatile i1 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define zeroext inreg i1 @zeroext_inreg_i1_func_void() {
+; GFX9-LABEL: name: zeroext_inreg_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    [[EXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s1)
+; GFX9-NEXT:    $vgpr0 = COPY [[EXT]](s32)
+; GFX9-NEXT:    SI_RETURN implicit $vgpr0
+;
+; GFX11-LABEL: name: zeroext_inreg_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    [[EXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s1)
+; GFX11-NEXT:    $vgpr0 = COPY [[EXT]](s32)
+; GFX11-NEXT:    SI_RETURN implicit $vgpr0
+  %val = load i1, ptr addrspace(1) undef
+  ret i1 %val
+}
+
+define void @test_call_zeroext_inreg_i1_func_void() {
+; GFX9-LABEL: name: test_call_zeroext_inreg_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:zeroext_inreg_i1_func_void]]
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX9-NEXT:    [[ASSERTEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY2]], 1
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERTEXT]](s32)
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_zeroext_inreg_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:zeroext_inreg_i1_func_void]]
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit-def $vgpr0
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT:    [[ASSERTEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 1
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERTEXT]](s32)
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  %val = call i1 @zeroext_inreg_i1_func_void()
+  store volatile i1 %val, ptr addrspace(1) undef
+  ret void
+}
+
 define [2 x i1] @a2i1_func_void() {
 ; GFX9-LABEL: name: a2i1_func_void
 ; GFX9: bb.1 (%ir-block.0):
@@ -201,8 +307,8 @@ define [2 x i1] @a2i1_func_void() {
 ; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
 ; GFX9-NEXT:    [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
 ; GFX9-NEXT:    [[LOAD2:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD]](p1) :: (load (s1) from `ptr addrspace(1) undef` + 1, addrspace 1)
-; GFX9-NEXT:    $sgpr0_sgpr1 = COPY [[LOAD]](s1)
-; GFX9-NEXT:    $sgpr2_sgpr3 = COPY [[LOAD2]](s1)
+; GFX9-NEXT:    $sgpr4_sgpr5 = COPY [[LOAD]](s1)
+; GFX9-NEXT:    $sgpr6_sgpr7 = COPY [[LOAD2]](s1)
 ; GFX9-NEXT:    SI_RETURN
 ;
 ; GFX11-LABEL: name: a2i1_func_void
@@ -227,9 +333,9 @@ define void @test_call_a2i1_func_void() {
 ; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a2i1_func_void
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @a2i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1, implicit-def $sgpr2_sgpr3
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
-; GFX9-NEXT:    [[COPY3:%[0-9]+]]:sreg_64(s1) = COPY $sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @a2i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr4_sgpr5, implicit-def $sgpr6_sgpr7
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY3:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
 ; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
@@ -256,3 +362,191 @@ define void @test_call_a2i1_func_void() {
   ret void
 }
 
+define [16 x i1] @a16i1_func_void(ptr addrspace(1) %in) {
+; GFX9-LABEL: name: a16i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    liveins: $vgpr0, $vgpr1, $vgpr2
+; GFX9-NEXT: {{  $}}
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
+; GFX9-NEXT:    [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+; GFX9-NEXT:    [[MERGE:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[MERGE]](p1) :: (load (s1) from %ir.in, addrspace 1)
+
+; GFX9-NEXT:    [[CONST1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; GFX9-NEXT:    [[PTRADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MERGE]], [[CONST1]](s64)
+; GFX9-NEXT:    [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD1]](p1) :: (load (s1) from %ir.in + 1, addrspace 1)
+; GFX9-NEXT:    [[CONST2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+; GFX9-NEXT:    [[PTRADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[MERGE]], [[CONST2]](s64)
+; GFX9-NEXT:    [[LOAD2:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD2]](p1) :: (load (s1) from %ir.in + 2, addrspace 1)
+; GFX9-NEXT:    [[CONST3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+; GFX9-NEXT:    [[PTRADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[MERGE]], [[CONST3]](s64)
+; GFX9-NEXT:    [[LOAD3:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD3]](p1) :: (load (s1) from %ir.in + 3, addrspace 1)
+; GFX9-NEXT:    [[CONST4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+; GFX9-NEXT:    [[PTRADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[MERGE]], [[CONST4]](s64)
+; GFX9-NEXT:    [[LOAD4:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD4]](p1) :: (load (s1) from %ir.in + 4, addrspace 1)
+; GFX9-NEXT:    [[CONST5:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+; GFX9-NEXT:    [[PTRADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[MERGE]], [[CONST5]](s64)
+; GFX9-NEXT:    [[LOAD5:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD5]](p1) :: (load (s1) from %ir.in + 5, addrspace 1)
+; GFX9-NEXT:    [[CONST6:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+; GFX9-NEXT:    [[PTRADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[MERGE]], [[CONST6]](s64)
+; GFX9-NEXT:    [[LOAD6:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD6]](p1) :: (load (s1) from %ir.in + 6, addrspace 1)
+; GFX9-NEXT:    [[CONST7:%[0-9]+]]:_(s64) = G_CONSTANT i64 7
+; GFX9-NEXT:    [[PTRADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[MERGE]], [[CONST7]](s64)
+; GFX9-NEXT:    [[LOAD7:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD7]](p1) :: (load (s1) from %ir.in + 7, addrspace 1)
+; GFX9-NEXT:    [[CONST8:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+; GFX9-NEXT:    [[PTRADD8:%[0-9]+]]:_(p1) = G_PTR_ADD [[MERGE]], [[CONST8]](s64)
+; GFX9-NEXT:    [[LOAD8:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD8]](p1) :: (load (s1) from %ir.in + 8, addrspace 1)
+; GFX9-NEXT:    [[CONST9:%[0-9]+]]:_(s64) = G_CONSTANT i64 9
+; GFX9-NEXT:    [[PTRADD9:%[0-9]+]]:_(p1) = G_PTR_ADD [[MERGE]], [[CONST9]](s64)
+; GFX9-NEXT:    [[LOAD9:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD9]](p1) :: (load (s1) from %ir.in + 9, addrspace 1)
+; GFX9-NEXT:    [[CONST10:%[0-9]+]]:_(s64) = G_CONSTANT i64 10
+; GFX9-NEXT:    [[PTRADD10:%[0-9]+]]:_(p1) = G_PTR_ADD [[MERGE]], [[CONST10]](s64)
+; GFX9-NEXT:    [[LOAD10:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD10]](p1) :: (load (s1) from %ir.in + 10, addrspace 1)
+; GFX9-NEXT:    [[CONST11:%[0-9]+]]:_(s64) = G_CONSTANT i64 11
+; GFX9-NEXT:    [[PTRADD11:%[0-9]+]]:_(p1) = G_PTR_ADD [[MERGE]], [[CONST11]](s64)
+; GFX9-NEXT:    [[LOAD11:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD11]](p1) :: (load (s1) from %ir.in + 11, addrspace 1)
+; GFX9-NEXT:    [[CONST12:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
+; GFX9-NEXT:    [[PTRADD12:%[0-9]+]]:_(p1) = G_PTR_ADD [[MERGE]], [[CONST12]](s64)
+; GFX9-NEXT:    [[LOAD12:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD12]](p1) :: (load (s1) from %ir.in + 12, addrspace 1)
+; GFX9-NEXT:    [[CONST13:%[0-9]+]]:_(s64) = G_CONSTANT i64 13
+; GFX9-NEXT:    [[PTRADD13:%[0-9]+]]:_(p1) = G_PTR_ADD [[MERGE]], [[CONST13]](s64)
+; GFX9-NEXT:    [[LOAD13:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD13]](p1) :: (load (s1) from %ir.in + 13, addrspace 1)
+; GFX9-NEXT:    [[CONST14:%[0-9]+]]:_(s64) = G_CONSTANT i64 14
+; GFX9-NEXT:    [[PTRADD14:%[0-9]+]]:_(p1) = G_PTR_ADD [[MERGE]], [[CONST14]](s64)
+; GFX9-NEXT:    [[LOAD14:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD14]](p1) :: (load (s1) from %ir.in + 14, addrspace 1)
+; GFX9-NEXT:    [[CONST15:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
+; GFX9-NEXT:    [[PTRADD15:%[0-9]+]]:_(p1) = G_PTR_ADD [[MERGE]], [[CONST15]](s64)
+; GFX9-NEXT:    [[LOAD15:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD15]](p1) :: (load (s1) from %ir.in + 15, addrspace 1)
+
+  %val = load [16 x i1], ptr addrspace(1) %in
+  ret [16 x i1] %val
+}
+
+define void @test_call_a16i1_func_void(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GFX9-LABEL: name: test_call_a16i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+; GFX9-NEXT: {{  $}}
+; GFX9:         [[FRAME:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
+; GFX9:         [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[FRAME]](p5) :: (load (s1) from %stack.0, addrspace 5)
+; GFX9-NEXT:    [[CONST1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; GFX9-NEXT:    [[PTRADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME]], [[CONST1]](s32)
+; GFX9-NEXT:    [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD1]](p5) :: (load (s1) from %stack.0, addrspace 5)
+; GFX9-NEXT:    [[CONST2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+; GFX9-NEXT:    [[PTRADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME]], [[CONST2]](s32)
+; GFX9-NEXT:    [[LOAD2:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD2]](p5) :: (load (s1) from %stack.0, addrspace 5)
+; GFX9-NEXT:    [[CONST3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+; GFX9-NEXT:    [[PTRADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME]], [[CONST3]](s32)
+; GFX9-NEXT:    [[LOAD3:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD3]](p5) :: (load (s1) from %stack.0, addrspace 5)
+; GFX9-NEXT:    [[CONST4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+; GFX9-NEXT:    [[PTRADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME]], [[CONST4]](s32)
+; GFX9-NEXT:    [[LOAD4:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD4]](p5) :: (load (s1) from %stack.0, addrspace 5)
+; GFX9-NEXT:    [[CONST5:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+; GFX9-NEXT:    [[PTRADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME]], [[CONST5]](s32)
+; GFX9-NEXT:    [[LOAD5:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD5]](p5) :: (load (s1) from %stack.0, addrspace 5)
+; GFX9-NEXT:    [[CONST6:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+; GFX9-NEXT:    [[PTRADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME]], [[CONST6]](s32)
+; GFX9-NEXT:    [[LOAD6:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD6]](p5) :: (load (s1) from %stack.0, addrspace 5)
+; GFX9-NEXT:    [[CONST7:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+; GFX9-NEXT:    [[PTRADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME]], [[CONST7]](s32)
+; GFX9-NEXT:    [[LOAD7:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD7]](p5) :: (load (s1) from %stack.0, addrspace 5)
+; GFX9-NEXT:    [[CONST8:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+; GFX9-NEXT:    [[PTRADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME]], [[CONST8]](s32)
+; GFX9-NEXT:    [[LOAD8:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD8]](p5) :: (load (s1) from %stack.0, addrspace 5)
+; GFX9-NEXT:    [[CONST9:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
+; GFX9-NEXT:    [[PTRADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME]], [[CONST9]](s32)
+; GFX9-NEXT:    [[LOAD9:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD9]](p5) :: (load (s1) from %stack.0, addrspace 5)
+; GFX9-NEXT:    [[CONST10:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+; GFX9-NEXT:    [[PTRADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME]], [[CONST10]](s32)
+; GFX9-NEXT:    [[LOAD10:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD10]](p5) :: (load (s1) from %stack.0, addrspace 5)
+; GFX9-NEXT:    [[CONST11:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
+; GFX9-NEXT:    [[PTRADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME]], [[CONST11]](s32)
+; GFX9-NEXT:    [[LOAD11:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD11]](p5) :: (load (s1) from %stack.0, addrspace 5)
+; GFX9-NEXT:    [[CONST12:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+; GFX9-NEXT:    [[PTRADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME]], [[CONST12]](s32)
+; GFX9-NEXT:    [[LOAD12:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD12]](p5) :: (load (s1) from %stack.0, addrspace 5)
+; GFX9-NEXT:    [[CONST13:%[0-9]+]]:_(s32) = G_CONSTANT i32 13
+; GFX9-NEXT:    [[PTRADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME]], [[CONST13]](s32)
+; GFX9-NEXT:    [[LOAD13:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD13]](p5) :: (load (s1) from %stack.0, addrspace 5)
+; GFX9-NEXT:    [[CONST14:%[0-9]+]]:_(s32) = G_CONSTANT i32 14
+; GFX9-NEXT:    [[PTRADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME]], [[CONST14]](s32)
+; GFX9-NEXT:    [[LOAD14:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD14]](p5) :: (load (s1) from %stack.0, addrspace 5)
+; GFX9-NEXT:    [[CONST15:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+; GFX9-NEXT:    [[PTRADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME]], [[CONST15]](s32)
+; GFX9-NEXT:    [[LOAD15:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD15]](p5) :: (load (s1) from %stack.0, addrspace 5)
+
+  %val = call [16 x i1] @a16i1_func_void(ptr addrspace(1) %in)
+  store volatile [16 x i1] %val, ptr addrspace(1) %out
+  ret void
+}
+
+define <2 x i1> @v2i1_func_void() {
+; GFX9-LABEL: name: v2i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(<2 x s1>) = G_LOAD [[DEF]](p1) :: (load (<2 x s1>) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    [[UNMERGE:%[0-9]+]]:_(s1), [[UNMERGE1:%[0-9]+]]:_(s1) = G_UNMERGE_VALUES [[LOAD]](<2 x s1>)
+; GFX9-NEXT:    [[EXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UNMERGE]](s1)
+; GFX9-NEXT:    [[EXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[UNMERGE1]](s1)
+; GFX9-NEXT:    [[EXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EXT]](s16)
+; GFX9-NEXT:    $vgpr0 = COPY [[EXT2]](s32)
+; GFX9-NEXT:    [[EXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EXT1]](s16)
+; GFX9-NEXT:    $vgpr1 = COPY [[EXT3]](s32)
+; GFX9-NEXT:    SI_RETURN implicit $vgpr0, implicit $vgpr1
+;
+; GFX11-LABEL: name: v2i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(<2 x s1>) = G_LOAD [[DEF]](p1) :: (load (<2 x s1>) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    [[UNMERGE:%[0-9]+]]:_(s1), [[UNMERGE1:%[0-9]+]]:_(s1) = G_UNMERGE_VALUES [[LOAD]](<2 x s1>)
+; GFX11-NEXT:    [[EXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UNMERGE]](s1)
+; GFX11-NEXT:    [[EXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[UNMERGE1]](s1)
+; GFX11-NEXT:    [[EXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EXT]](s16)
+; GFX11-NEXT:    $vgpr0 = COPY [[EXT2]](s32)
+; GFX11-NEXT:    [[EXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EXT1]](s16)
+; GFX11-NEXT:    $vgpr1 = COPY [[EXT3]](s32)
+; GFX11-NEXT:    SI_RETURN implicit $vgpr0, implicit $vgpr1
+  %val = load <2 x i1>, ptr addrspace(1) undef
+  ret <2 x i1> %val
+}
+
+define void @test_call_v2i1_func_void() {
+; GFX9-LABEL: name: test_call_v2i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:v2i1_func_void]]
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX9-NEXT:    [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+; GFX9-NEXT:    [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX9-NEXT:    [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+; GFX9-NEXT:    [[BUILDVEC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+; GFX9-NEXT:    [[TRUNC4:%[0-9]+]]:_(<2 x s1>) = G_TRUNC [[BUILDVEC]](<2 x s16>)
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    G_STORE [[TRUNC4]](<2 x s1>), [[DEF]](p1) :: (volatile store (<2 x s1>) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_v2i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:v2i1_func_void]]
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit-def $vgpr0, implicit-def $vgpr1
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT:    [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT:    [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX11-NEXT:    [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+; GFX11-NEXT:    [[BUILDVEC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+; GFX11-NEXT:    [[TRUNC4:%[0-9]+]]:_(<2 x s1>) = G_TRUNC [[BUILDVEC]](<2 x s16>)
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    G_STORE [[TRUNC4]](<2 x s1>), [[DEF]](p1) :: (volatile store (<2 x s1>) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+
+  %val = call <2 x i1> @v2i1_func_void()
+  store volatile <2 x i1> %val, ptr addrspace(1) undef
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
index 8fdd512a1c61a..7739a705fff1c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
@@ -232,6 +232,87 @@ define void @test_call_void_func_a2i1() {
   ret void
 }
 
+define void @void_func_v2i1(<2 x i1> %arg0) {
+; GFX9-LABEL: name: void_func_v2i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    liveins: $vgpr0, $vgpr1
+; GFX9-NEXT: {{  $}}
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+; GFX9-NEXT:    [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX9-NEXT:    [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+; GFX9-NEXT:    [[BUILDVEC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+; GFX9-NEXT:    [[TRUNC2:%[0-9]+]]:_(<2 x s1>) = G_TRUNC [[BUILDVEC]](<2 x s16>)
+; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX9-NEXT:    G_STORE [[TRUNC2]](<2 x s1>), [[DEF]](p1) :: (store (<2 x s1>) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: void_func_v2i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    liveins: $vgpr0, $vgpr1
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:    [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX11-NEXT:    [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+; GFX11-NEXT:    [[BUILDVEC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+; GFX11-NEXT:    [[TRUNC2:%[0-9]+]]:_(<2 x s1>) = G_TRUNC [[BUILDVEC]](<2 x s16>)
+; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
+; GFX11-NEXT:    G_STORE [[TRUNC2]](<2 x s1>), [[DEF]](p1) :: (store (<2 x s1>) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT:    SI_RETURN
+  store <2 x i1> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @test_call_void_func_v2i1(ptr addrspace(1) %in) {
+; GFX9-LABEL: name: test_call_void_func_v2i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT:    liveins: $vgpr0, $vgpr1
+; GFX9-NEXT: {{  $}}
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX9-NEXT:    [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX9-NEXT:    [[MERGE:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+; GFX9-NEXT:    [[LOAD:%[0-9]+]]:_(<2 x s1>) = G_LOAD [[MERGE]](p1) :: (load (<2 x s1>) from %ir.in, addrspace 1)
+; GFX9-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_v2i1]]
+; GFX9-NEXT:    [[UNMERGE:%[0-9]+]]:_(s1), [[UNMERGE1:%[0-9]+]]:_(s1) = G_UNMERGE_VALUES [[LOAD]](<2 x s1>)
+; GFX9-NEXT:    [[EXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UNMERGE]](s1)
+; GFX9-NEXT:    [[EXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[UNMERGE1]](s1)
+; GFX9-NEXT:    [[EXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EXT]](s16)
+; GFX9-NEXT:    $vgpr0 = COPY [[EXT2]](s32)
+; GFX9-NEXT:    [[EXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EXT1]](s16)
+; GFX9-NEXT:    $vgpr1 = COPY [[EXT3]](s32)
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
+; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT:    SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_v2i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT:    liveins: $vgpr0, $vgpr1
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT:    [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX11-NEXT:    [[MERGE:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+; GFX11-NEXT:    [[LOAD:%[0-9]+]]:_(<2 x s1>) = G_LOAD [[MERGE]](p1) :: (load (<2 x s1>) from %ir.in, addrspace 1)
+; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_v2i1]]
+; GFX11-NEXT:    [[UNMERGE:%[0-9]+]]:_(s1), [[UNMERGE1:%[0-9]+]]:_(s1) = G_UNMERGE_VALUES [[LOAD]](<2 x s1>)
+; GFX11-NEXT:    [[EXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UNMERGE]](s1)
+; GFX11-NEXT:    [[EXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[UNMERGE1]](s1)
+; GFX11-NEXT:    [[EXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EXT]](s16)
+; GFX11-NEXT:    $vgpr0 = COPY [[EXT2]](s32)
+; GFX11-NEXT:    [[EXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EXT1]](s16)
+; GFX11-NEXT:    $vgpr1 = COPY [[EXT3]](s32)
+; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $vgpr0, implicit $vgpr1
+; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT:    SI_RETURN
+  %a = load <2 x i1>, ptr addrspace(1) %in
+  call void @void_func_v2i1(<2 x i1> %a)
+  ret void
+}
+
 define void @void_func_i1_i1(i1 %arg0, i1 %arg1) {
 ; GFX9-LABEL: name: void_func_i1_i1
 ; GFX9: bb.1 (%ir-block.0):
diff --git a/llvm/test/CodeGen/AMDGPU/allow-check.ll b/llvm/test/CodeGen/AMDGPU/allow-check.ll
index d4f5621ce26a4..e47e120b6d381 100644
--- a/llvm/test/CodeGen/AMDGPU/allow-check.ll
+++ b/llvm/test/CodeGen/AMDGPU/allow-check.ll
@@ -1,14 +1,20 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -global-isel=0 -fast-isel=0 | FileCheck %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -global-isel=1 -fast-isel=0 | FileCheck %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -global-isel=1 -fast-isel=0 | FileCheck -check-prefixes=GISEL %s
 ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -global-isel=0 -fast-isel=1 | FileCheck %s
 
 define i1 @test_runtime() local_unnamed_addr {
 ; CHECK-LABEL: test_runtime:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; CHECK-NEXT:    s_mov_b64 s[4:5], -1
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_runtime:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b64 s[4:5], 1
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %allow = call i1 @llvm.allow.runtime.check(metadata !"test_check")
   ret i1 %allow
@@ -20,8 +26,14 @@ define i1 @test_ubsan() local_unnamed_addr {
 ; CHECK-LABEL: test_ubsan:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; CHECK-NEXT:    s_mov_b64 s[4:5], -1
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_ubsan:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b64 s[4:5], 1
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %allow = call i1 @llvm.allow.ubsan.check(i8 7)
   ret i1 %allow
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index b8758a72998e2..82f9970a1674b 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -33540,28 +33540,28 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_select_fneg_lhs_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_fneg_lhs_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_fneg_lhs_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX10-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_fneg_lhs_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -33592,28 +33592,28 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_select_fneg_rhs_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_fneg_rhs_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_fneg_rhs_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX10-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_fneg_rhs_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
index c91f8cd889c88..8cea65d2c50e4 100644
--- a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
@@ -403,6 +403,173 @@ define void @test_call_inreg_i1_func_void() {
   ret void
 }
 
+define signext inreg i1 @signext_inreg_i1_func_void() {
+; GFX9-LABEL: signext_inreg_i1_func_void:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: signext_inreg_i1_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load i1, ptr addrspace(1) undef
+  ret i1 %val
+}
+
+define void @test_call_signext_inreg_i1_func_void() {
+; GFX9-LABEL: test_call_signext_inreg_i1_func_void:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, signext_inreg_i1_func_void at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, signext_inreg_i1_func_void at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_signext_inreg_i1_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, signext_inreg_i1_func_void at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, signext_inreg_i1_func_void at gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = call i1 @signext_inreg_i1_func_void()
+  store volatile i1 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define zeroext inreg i1 @zeroext_inreg_i1_func_void() {
+; GFX9-LABEL: zeroext_inreg_i1_func_void:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: zeroext_inreg_i1_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load i1, ptr addrspace(1) undef
+  ret i1 %val
+}
+
+define void @test_call_zeroext_inreg_i1_func_void() {
+; GFX9-LABEL: test_call_zeroext_inreg_i1_func_void:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, zeroext_inreg_i1_func_void at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, zeroext_inreg_i1_func_void at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_zeroext_inreg_i1_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, zeroext_inreg_i1_func_void at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, zeroext_inreg_i1_func_void at gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = call i1 @zeroext_inreg_i1_func_void()
+  store volatile i1 %val, ptr addrspace(1) undef
+  ret void
+}
+
 define [2 x i1] @a2i1_func_void() {
 ; GFX9-LABEL: a2i1_func_void:
 ; GFX9:       ; %bb.0:
@@ -518,3 +685,341 @@ define void @test_call_a2i1_func_void() {
   ret void
 }
 
+define [16 x i1] @a16i1_func_void(ptr addrspace(1) %in) {
+; GFX9-LABEL: a16i1_func_void:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v3, v[1:2], off offset:15
+; GFX9-NEXT:    global_load_ubyte v4, v[1:2], off offset:14
+; GFX9-NEXT:    global_load_ubyte v5, v[1:2], off offset:13
+; GFX9-NEXT:    global_load_ubyte v6, v[1:2], off offset:12
+; GFX9-NEXT:    global_load_ubyte v7, v[1:2], off offset:11
+; GFX9-NEXT:    global_load_ubyte v8, v[1:2], off offset:10
+; GFX9-NEXT:    global_load_ubyte v9, v[1:2], off offset:9
+; GFX9-NEXT:    global_load_ubyte v10, v[1:2], off offset:8
+; GFX9-NEXT:    global_load_ubyte v11, v[1:2], off offset:7
+; GFX9-NEXT:    global_load_ubyte v12, v[1:2], off offset:6
+; GFX9-NEXT:    global_load_ubyte v13, v[1:2], off offset:5
+; GFX9-NEXT:    global_load_ubyte v14, v[1:2], off offset:4
+; GFX9-NEXT:    global_load_ubyte v15, v[1:2], off offset:3
+; GFX9-NEXT:    global_load_ubyte v16, v[1:2], off offset:2
+; GFX9-NEXT:    global_load_ubyte v17, v[1:2], off offset:1
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    global_load_ubyte v1, v[1:2], off
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(14)
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(13)
+; GFX9-NEXT:    v_and_b32_e32 v4, 1, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(12)
+; GFX9-NEXT:    v_and_b32_e32 v5, 1, v6
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
+; GFX9-NEXT:    v_and_b32_e32 v6, 1, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(10)
+; GFX9-NEXT:    v_and_b32_e32 v7, 1, v8
+; GFX9-NEXT:    s_waitcnt vmcnt(9)
+; GFX9-NEXT:    v_and_b32_e32 v8, 1, v9
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    v_and_b32_e32 v9, 1, v10
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    v_and_b32_e32 v10, 1, v11
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_and_b32_e32 v11, 1, v12
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    v_and_b32_e32 v12, 1, v13
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_and_b32_e32 v13, 1, v14
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_and_b32_e32 v14, 1, v15
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    v_and_b32_e32 v15, 1, v16
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v16, 1, v17
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
+; GFX9-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
+; GFX9-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
+; GFX9-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
+; GFX9-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
+; GFX9-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
+; GFX9-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
+; GFX9-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
+; GFX9-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
+; GFX9-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
+; GFX9-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
+; GFX9-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
+; GFX9-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: a16i1_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    global_load_u8 v2, v[0:1], off
+; GFX11-NEXT:    global_load_u8 v3, v[0:1], off offset:1
+; GFX11-NEXT:    global_load_u8 v4, v[0:1], off offset:2
+; GFX11-NEXT:    global_load_u8 v5, v[0:1], off offset:3
+; GFX11-NEXT:    global_load_u8 v6, v[0:1], off offset:4
+; GFX11-NEXT:    global_load_u8 v7, v[0:1], off offset:5
+; GFX11-NEXT:    global_load_u8 v8, v[0:1], off offset:6
+; GFX11-NEXT:    global_load_u8 v9, v[0:1], off offset:7
+; GFX11-NEXT:    global_load_u8 v10, v[0:1], off offset:8
+; GFX11-NEXT:    global_load_u8 v11, v[0:1], off offset:9
+; GFX11-NEXT:    global_load_u8 v12, v[0:1], off offset:10
+; GFX11-NEXT:    global_load_u8 v13, v[0:1], off offset:11
+; GFX11-NEXT:    global_load_u8 v14, v[0:1], off offset:12
+; GFX11-NEXT:    global_load_u8 v15, v[0:1], off offset:13
+; GFX11-NEXT:    global_load_u8 v16, v[0:1], off offset:14
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:15
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-NEXT:    v_and_b32_e32 v3, 1, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-NEXT:    v_and_b32_e32 v4, 1, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    v_and_b32_e32 v5, 1, v6
+; GFX11-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-NEXT:    v_and_b32_e32 v6, 1, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    v_and_b32_e32 v7, 1, v8
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    v_and_b32_e32 v8, 1, v9
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    v_and_b32_e32 v9, 1, v10
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    v_and_b32_e32 v10, 1, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    v_and_b32_e32 v11, 1, v12
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    v_and_b32_e32 v12, 1, v13
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    v_and_b32_e32 v13, 1, v14
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    v_and_b32_e32 v14, 1, v15
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v15, 1, v16
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 1, v2
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 1, v3
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s3, 1, v4
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s4, 1, v5
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s5, 1, v6
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s6, 1, v7
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s7, 1, v8
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s8, 1, v9
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s9, 1, v10
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s10, 1, v11
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s11, 1, v12
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s12, 1, v13
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s13, 1, v14
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s14, 1, v15
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s15, 1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load [16 x i1], ptr addrspace(1) %in
+  ret [16 x i1] %val
+}
+
+define void @test_call_a16i1_func_void(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GFX9-LABEL: test_call_a16i1_func_void:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x800
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, a16i1_func_void at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, a16i1_func_void at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v20, s30, 0
+; GFX9-NEXT:    v_mov_b32_e32 v18, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GFX9-NEXT:    v_writelane_b32 v20, s31, 1
+; GFX9-NEXT:    v_mov_b32_e32 v19, v3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:15
+; GFX9-NEXT:    buffer_load_ubyte v1, off, s[0:3], s33 offset:14
+; GFX9-NEXT:    buffer_load_ubyte v2, off, s[0:3], s33 offset:13
+; GFX9-NEXT:    buffer_load_ubyte v3, off, s[0:3], s33 offset:12
+; GFX9-NEXT:    buffer_load_ubyte v4, off, s[0:3], s33 offset:11
+; GFX9-NEXT:    buffer_load_ubyte v5, off, s[0:3], s33 offset:10
+; GFX9-NEXT:    buffer_load_ubyte v6, off, s[0:3], s33 offset:9
+; GFX9-NEXT:    buffer_load_ubyte v7, off, s[0:3], s33 offset:8
+; GFX9-NEXT:    buffer_load_ubyte v8, off, s[0:3], s33 offset:7
+; GFX9-NEXT:    buffer_load_ubyte v9, off, s[0:3], s33 offset:6
+; GFX9-NEXT:    buffer_load_ubyte v10, off, s[0:3], s33 offset:5
+; GFX9-NEXT:    buffer_load_ubyte v11, off, s[0:3], s33 offset:4
+; GFX9-NEXT:    buffer_load_ubyte v12, off, s[0:3], s33 offset:3
+; GFX9-NEXT:    buffer_load_ubyte v13, off, s[0:3], s33 offset:2
+; GFX9-NEXT:    buffer_load_ubyte v14, off, s[0:3], s33 offset:1
+; GFX9-NEXT:    buffer_load_ubyte v15, off, s[0:3], s33
+; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v20, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(14)
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(13)
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(12)
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
+; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(10)
+; GFX9-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(9)
+; GFX9-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_and_b32_e32 v11, 1, v11
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v15, 1, v15
+; GFX9-NEXT:    global_store_byte v[18:19], v0, off offset:15
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[18:19], v1, off offset:14
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[18:19], v2, off offset:13
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[18:19], v3, off offset:12
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[18:19], v4, off offset:11
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[18:19], v5, off offset:10
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[18:19], v6, off offset:9
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[18:19], v7, off offset:8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[18:19], v8, off offset:7
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[18:19], v9, off offset:6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[18:19], v10, off offset:5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[18:19], v11, off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[18:19], v12, off offset:3
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[18:19], v13, off offset:2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[18:19], v14, off offset:1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[18:19], v15, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xf800
+; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_a16i1_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s16, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v19, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, a16i1_func_void at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, a16i1_func_void at gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v19, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v18, v3 :: v_dual_mov_b32 v17, v2
+; GFX11-NEXT:    v_writelane_b32 v19, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
+; GFX11-NEXT:    global_store_b8 v[17:18], v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[17:18], v2, off offset:1 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[17:18], v3, off offset:2 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[17:18], v4, off offset:3 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[17:18], v5, off offset:4 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s9
+; GFX11-NEXT:    global_store_b8 v[17:18], v1, off offset:5 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[17:18], v2, off offset:6 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[17:18], v3, off offset:7 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[17:18], v4, off offset:8 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[17:18], v5, off offset:9 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s11
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s13
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s15
+; GFX11-NEXT:    global_store_b8 v[17:18], v1, off offset:10 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[17:18], v2, off offset:11 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[17:18], v3, off offset:12 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[17:18], v4, off offset:13 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[17:18], v5, off offset:14 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[17:18], v0, off offset:15 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_readlane_b32 s31, v19, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v19, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v19, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s16
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = call [16 x i1] @a16i1_func_void(ptr addrspace(1) %in)
+  store volatile [16 x i1] %val, ptr addrspace(1) %out
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
index caf0879671d85..c79f7285b0f2b 100644
--- a/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
@@ -394,7 +394,6 @@ define void @i1_arg_i1_use(i1 %arg) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:  .LBB8_2: ; %bb2
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i1_arg_i1_use:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index d7773f746c6a6..c630608ba418c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -5469,11 +5469,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s37
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
 ; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7LESS-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX7LESS-NEXT:  .LBB9_3:
@@ -5506,12 +5505,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX9-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
@@ -5519,11 +5518,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[40:43], 0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX9-NEXT:    s_mov_b32 s12, s33
-; GFX9-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
@@ -5536,11 +5535,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX9-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX9-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX9-NEXT:  .LBB9_3:
@@ -5573,12 +5571,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1064-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1064-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
@@ -5586,29 +5584,29 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1064-NEXT:    s_mov_b32 s12, s33
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1064-NEXT:  .LBB9_3:
@@ -5640,12 +5638,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1032-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1032-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
@@ -5653,29 +5651,29 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1032-NEXT:    s_mov_b32 s12, s33
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1032-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_or_b32 s38, s0, s38
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s38
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1032-NEXT:  .LBB9_3:
@@ -5704,15 +5702,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1164-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
@@ -5720,29 +5717,27 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1164-NEXT:    s_mov_b32 s12, s33
 ; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[38:39]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1164-NEXT:  .LBB9_3:
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
@@ -5766,40 +5761,38 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1132-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1132-NEXT:    s_mov_b32 s12, s33
 ; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v3, s37
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_or_b32 s38, s0, s38
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s38
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1132-NEXT:  .LBB9_3:
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
@@ -5829,12 +5822,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX9-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -5842,11 +5835,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s33
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
@@ -5859,11 +5852,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX9-DPP-NEXT:  .LBB9_3:
@@ -5896,12 +5888,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1064-DPP-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1064-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -5909,29 +5901,29 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
+; GFX1064-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1064-DPP-NEXT:  .LBB9_3:
@@ -5963,12 +5955,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1032-DPP-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1032-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -5976,29 +5968,29 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1032-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_or_b32 s38, s0, s38
+; GFX1032-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s38
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1032-DPP-NEXT:  .LBB9_3:
@@ -6027,15 +6019,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -6043,29 +6034,27 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[38:39]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1164-DPP-NEXT:  .LBB9_3:
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
@@ -6089,40 +6078,38 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v3, s37
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_or_b32 s38, s0, s38
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s38
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1132-DPP-NEXT:  .LBB9_3:
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4
   ret void
@@ -6205,11 +6192,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
 ; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
 ; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[42:43], vcc, s[42:43]
+; GFX7LESS-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT:    s_or_b64 s[42:43], s[0:1], s[42:43]
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[42:43]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6251,14 +6237,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v41, v1
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX9-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-NEXT:    v_mov_b32_e32 v41, v1
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX9-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX9-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX9-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX9-NEXT:    s_add_u32 s8, s36, 44
 ; GFX9-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
@@ -6266,11 +6252,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
 ; GFX9-NEXT:    s_mov_b32 s12, s41
 ; GFX9-NEXT:    s_mov_b32 s13, s40
@@ -6287,11 +6273,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6333,26 +6318,26 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX1064-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1064-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX1064-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX1064-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1064-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s42
+; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
@@ -6363,18 +6348,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_mov_b32 s13, s40
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6416,26 +6401,26 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1032-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX1032-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1032-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX1032-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1032-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s42
+; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
@@ -6446,18 +6431,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_mov_b32 s13, s40
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1032-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_or_b32 s44, s0, s44
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6489,17 +6474,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1164-NEXT:    global_load_b64 v[1:2], v43, s[42:43]
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1164-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1164-NEXT:    global_load_b64 v[0:1], v43, s[42:43]
 ; GFX1164-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1164-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
@@ -6507,7 +6491,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
@@ -6517,19 +6501,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_mov_b32 s13, s40
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6562,20 +6545,20 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
-; GFX1132-NEXT:    global_load_b64 v[1:2], v43, s[42:43]
+; GFX1132-NEXT:    global_load_b64 v[0:1], v43, s[42:43]
 ; GFX1132-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1132-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
@@ -6585,17 +6568,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_mov_b32 s13, s40
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_or_b32 s44, s0, s44
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6638,14 +6621,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, v1
+; GFX9-DPP-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX9-DPP-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX9-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -6653,11 +6636,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s41
 ; GFX9-DPP-NEXT:    s_mov_b32 s13, s40
@@ -6674,11 +6657,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6720,26 +6702,26 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1064-DPP-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX1064-DPP-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1064-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
@@ -6750,18 +6732,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_mov_b32 s13, s40
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
+; GFX1064-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6803,26 +6785,26 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1032-DPP-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1032-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
@@ -6833,18 +6815,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, s40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1032-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_or_b32 s44, s0, s44
+; GFX1032-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6876,17 +6858,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v43, s[42:43]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1164-DPP-NEXT:    global_load_b64 v[0:1], v43, s[42:43]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -6894,7 +6875,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
@@ -6904,19 +6885,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s40
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6949,20 +6929,20 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
-; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v43, s[42:43]
+; GFX1132-DPP-NEXT:    global_load_b64 v[0:1], v43, s[42:43]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
@@ -6972,17 +6952,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_or_b32 s44, s0, s44
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -9428,11 +9408,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s37
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
 ; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7LESS-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX7LESS-NEXT:  .LBB16_3:
@@ -9468,12 +9447,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX9-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX9-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
@@ -9481,11 +9460,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[40:43], 0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX9-NEXT:    s_mov_b32 s12, s33
-; GFX9-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
@@ -9498,11 +9477,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX9-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX9-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX9-NEXT:  .LBB16_3:
@@ -9536,12 +9514,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1064-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1064-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
@@ -9549,29 +9527,29 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1064-NEXT:    s_mov_b32 s12, s33
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1064-NEXT:  .LBB16_3:
@@ -9604,12 +9582,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1032-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1032-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
@@ -9617,29 +9595,29 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1032-NEXT:    s_mov_b32 s12, s33
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1032-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_or_b32 s38, s0, s38
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s38
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1032-NEXT:  .LBB16_3:
@@ -9674,15 +9652,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1164-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
@@ -9690,29 +9667,27 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1164-NEXT:    s_mov_b32 s12, s33
 ; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[38:39]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1164-NEXT:  .LBB16_3:
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
@@ -9742,40 +9717,38 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1132-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
 ; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1132-NEXT:    s_mov_b32 s12, s33
 ; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v3, s37
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_or_b32 s38, s0, s38
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s38
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1132-NEXT:  .LBB16_3:
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
@@ -9808,12 +9781,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX9-DPP-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DPP-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX9-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -9821,11 +9794,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s33
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
@@ -9838,11 +9811,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX9-DPP-NEXT:  .LBB16_3:
@@ -9876,12 +9848,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1064-DPP-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-DPP-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1064-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -9889,29 +9861,29 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
+; GFX1064-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1064-DPP-NEXT:  .LBB16_3:
@@ -9944,12 +9916,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1032-DPP-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-DPP-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1032-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -9957,29 +9929,29 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1032-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_or_b32 s38, s0, s38
+; GFX1032-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s38
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1032-DPP-NEXT:  .LBB16_3:
@@ -10014,15 +9986,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -10030,29 +10001,27 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[38:39]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1164-DPP-NEXT:  .LBB16_3:
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
@@ -10082,40 +10051,38 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
 ; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v3, s37
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_or_b32 s38, s0, s38
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s38
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1132-DPP-NEXT:  .LBB16_3:
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 monotonic, align 4
   ret void
@@ -10198,11 +10165,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
 ; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
 ; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[42:43], vcc, s[42:43]
+; GFX7LESS-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT:    s_or_b64 s[42:43], s[0:1], s[42:43]
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[42:43]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10244,14 +10210,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX9-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v41, v1
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX9-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-NEXT:    v_mov_b32_e32 v41, v1
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX9-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX9-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX9-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX9-NEXT:    s_add_u32 s8, s36, 44
 ; GFX9-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
@@ -10259,11 +10225,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
 ; GFX9-NEXT:    s_mov_b32 s12, s41
 ; GFX9-NEXT:    s_mov_b32 s13, s40
@@ -10280,11 +10246,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10326,26 +10291,26 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX1064-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1064-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX1064-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX1064-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1064-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s42
+; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
@@ -10356,18 +10321,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_mov_b32 s13, s40
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10409,26 +10374,26 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1032-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX1032-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1032-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX1032-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1032-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s42
+; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
@@ -10439,18 +10404,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_mov_b32 s13, s40
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1032-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_or_b32 s44, s0, s44
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10482,17 +10447,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1164-NEXT:    global_load_b64 v[1:2], v43, s[42:43]
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1164-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1164-NEXT:    global_load_b64 v[0:1], v43, s[42:43]
 ; GFX1164-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1164-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
@@ -10500,7 +10464,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
@@ -10510,19 +10474,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_mov_b32 s13, s40
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10555,20 +10518,20 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
-; GFX1132-NEXT:    global_load_b64 v[1:2], v43, s[42:43]
+; GFX1132-NEXT:    global_load_b64 v[0:1], v43, s[42:43]
 ; GFX1132-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1132-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
@@ -10578,17 +10541,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    s_mov_b32 s13, s40
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_or_b32 s44, s0, s44
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10631,14 +10594,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, v1
+; GFX9-DPP-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX9-DPP-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX9-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -10646,11 +10609,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s41
 ; GFX9-DPP-NEXT:    s_mov_b32 s13, s40
@@ -10667,11 +10630,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10713,26 +10675,26 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1064-DPP-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX1064-DPP-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1064-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
@@ -10743,18 +10705,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_mov_b32 s13, s40
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
+; GFX1064-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10796,26 +10758,26 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1032-DPP-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1032-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
@@ -10826,18 +10788,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, s40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1032-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_or_b32 s44, s0, s44
+; GFX1032-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10869,17 +10831,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v43, s[42:43]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1164-DPP-NEXT:    global_load_b64 v[0:1], v43, s[42:43]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -10887,7 +10848,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
@@ -10897,19 +10858,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s40
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10942,20 +10902,20 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
-; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v43, s[42:43]
+; GFX1132-DPP-NEXT:    global_load_b64 v[0:1], v43, s[42:43]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
@@ -10965,17 +10925,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_or_b32 s44, s0, s44
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 98c09dfaa2d5a..bdc9c6c42c225 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -3608,11 +3608,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s37
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
 ; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7LESS-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX7LESS-NEXT:  .LBB6_3:
@@ -3641,12 +3640,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
@@ -3654,29 +3653,28 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[40:43], 0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX9-NEXT:    s_mov_b32 s12, s33
-; GFX9-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
+; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s36
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s37
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX9-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX9-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX9-NEXT:  .LBB6_3:
@@ -3705,12 +3703,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
@@ -3718,30 +3716,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1064-NEXT:    s_mov_b32 s12, s33
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1064-NEXT:  .LBB6_3:
@@ -3769,12 +3767,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1032-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
@@ -3782,30 +3780,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1032-NEXT:    s_mov_b32 s12, s33
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1032-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1032-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_or_b32 s38, s0, s38
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s38
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1032-NEXT:  .LBB6_3:
@@ -3829,15 +3827,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[36:37], 0x0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1164-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
@@ -3845,26 +3843,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1164-NEXT:    s_mov_b32 s12, s33
-; GFX1164-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1164-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
 ; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[38:39]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1164-NEXT:  .LBB6_3:
@@ -3887,38 +3884,37 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[36:37], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1132-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1132-NEXT:    s_mov_b32 s12, s33
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
 ; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v3, s37
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_or_b32 s38, s0, s38
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s38
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1132-NEXT:  .LBB6_3:
@@ -3948,12 +3944,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX9-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -3961,29 +3957,28 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s33
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
+; GFX9-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s36
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX9-DPP-NEXT:  .LBB6_3:
@@ -4012,12 +4007,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -4025,30 +4020,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
+; GFX1064-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1064-DPP-NEXT:  .LBB6_3:
@@ -4076,12 +4071,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1032-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -4089,30 +4084,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1032-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_or_b32 s38, s0, s38
+; GFX1032-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s38
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1032-DPP-NEXT:  .LBB6_3:
@@ -4136,15 +4131,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[36:37], 0x0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -4152,26 +4147,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s33
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
 ; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[38:39]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1164-DPP-NEXT:  .LBB6_3:
@@ -4194,38 +4188,37 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[36:37], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s33
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
 ; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v3, s37
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_or_b32 s38, s0, s38
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s38
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1132-DPP-NEXT:  .LBB6_3:
@@ -4311,11 +4304,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7LESS-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX7LESS-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7LESS-NEXT:    s_or_b64 s[42:43], vcc, s[42:43]
+; GFX7LESS-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT:    s_or_b64 s[42:43], s[0:1], s[42:43]
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[42:43]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4395,9 +4387,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX9-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4478,9 +4469,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_clause 0x1
 ; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX1064-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4561,9 +4552,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_clause 0x1
 ; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX1032-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1032-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_or_b32 s44, s0, s44
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4631,10 +4622,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1164-NEXT:    scratch_load_b64 v[2:3], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4699,10 +4689,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    scratch_load_b64 v[2:3], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1132-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_or_b32 s44, s0, s44
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4783,9 +4772,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX9-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4866,9 +4854,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_clause 0x1
 ; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX1064-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
+; GFX1064-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4949,9 +4937,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_clause 0x1
 ; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX1032-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1032-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_or_b32 s44, s0, s44
+; GFX1032-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5019,10 +5007,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1164-DPP-NEXT:    scratch_load_b64 v[2:3], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5087,10 +5074,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[2:3], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1132-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_or_b32 s44, s0, s44
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6017,11 +6003,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s37
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
 ; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7LESS-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX7LESS-NEXT:  .LBB10_3:
@@ -6050,12 +6035,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
@@ -6063,29 +6048,28 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[40:43], 0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX9-NEXT:    s_mov_b32 s12, s33
-; GFX9-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
+; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s36
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s37
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX9-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX9-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX9-NEXT:  .LBB10_3:
@@ -6114,12 +6098,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
@@ -6127,30 +6111,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1064-NEXT:    s_mov_b32 s12, s33
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1064-NEXT:  .LBB10_3:
@@ -6178,12 +6162,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1032-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
@@ -6191,30 +6175,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1032-NEXT:    s_mov_b32 s12, s33
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1032-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1032-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_or_b32 s38, s0, s38
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s38
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1032-NEXT:  .LBB10_3:
@@ -6238,15 +6222,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[36:37], 0x0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1164-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
@@ -6254,26 +6238,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1164-NEXT:    s_mov_b32 s12, s33
-; GFX1164-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1164-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
 ; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[38:39]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1164-NEXT:  .LBB10_3:
@@ -6296,38 +6279,37 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[36:37], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1132-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1132-NEXT:    s_mov_b32 s12, s33
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
 ; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v3, s37
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_or_b32 s38, s0, s38
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s38
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1132-NEXT:  .LBB10_3:
@@ -6357,12 +6339,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX9-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -6370,29 +6352,28 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s33
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
+; GFX9-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s36
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX9-DPP-NEXT:  .LBB10_3:
@@ -6421,12 +6402,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -6434,30 +6415,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
+; GFX1064-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1064-DPP-NEXT:  .LBB10_3:
@@ -6485,12 +6466,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1032-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -6498,30 +6479,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1032-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_or_b32 s38, s0, s38
+; GFX1032-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s38
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1032-DPP-NEXT:  .LBB10_3:
@@ -6545,15 +6526,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[36:37], 0x0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -6561,26 +6542,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s33
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
 ; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[38:39]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1164-DPP-NEXT:  .LBB10_3:
@@ -6603,38 +6583,37 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[36:37], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s33
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], 4.0
 ; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v3, s37
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_or_b32 s38, s0, s38
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s38
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1132-DPP-NEXT:  .LBB10_3:
@@ -6720,11 +6699,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7LESS-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX7LESS-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7LESS-NEXT:    s_or_b64 s[42:43], vcc, s[42:43]
+; GFX7LESS-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT:    s_or_b64 s[42:43], s[0:1], s[42:43]
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[42:43]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6804,9 +6782,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX9-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6887,9 +6864,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_clause 0x1
 ; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX1064-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6970,9 +6947,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_clause 0x1
 ; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX1032-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1032-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_or_b32 s44, s0, s44
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7040,10 +7017,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1164-NEXT:    scratch_load_b64 v[2:3], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7108,10 +7084,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    scratch_load_b64 v[2:3], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1132-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_or_b32 s44, s0, s44
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7192,9 +7167,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX9-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7275,9 +7249,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_clause 0x1
 ; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX1064-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
+; GFX1064-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7358,9 +7332,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    s_clause 0x1
 ; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX1032-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1032-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_or_b32 s44, s0, s44
+; GFX1032-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7428,10 +7402,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1164-DPP-NEXT:    scratch_load_b64 v[2:3], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7496,10 +7469,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[2:3], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1132-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_or_b32 s44, s0, s44
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 1fb0db0e1f0d3..40d88ea46f1ac 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -3608,11 +3608,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s37
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
 ; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7LESS-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX7LESS-NEXT:  .LBB6_3:
@@ -3641,12 +3640,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
@@ -3654,29 +3653,28 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[40:43], 0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX9-NEXT:    s_mov_b32 s12, s33
-; GFX9-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
+; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s36
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s37
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX9-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX9-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX9-NEXT:  .LBB6_3:
@@ -3705,12 +3703,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
@@ -3718,30 +3716,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1064-NEXT:    s_mov_b32 s12, s33
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1064-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1064-NEXT:  .LBB6_3:
@@ -3769,12 +3767,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1032-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
@@ -3782,30 +3780,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1032-NEXT:    s_mov_b32 s12, s33
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1032-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1032-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_or_b32 s38, s0, s38
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s38
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1032-NEXT:  .LBB6_3:
@@ -3829,15 +3827,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[36:37], 0x0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1164-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
@@ -3845,26 +3843,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1164-NEXT:    s_mov_b32 s12, s33
-; GFX1164-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1164-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
 ; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[38:39]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1164-NEXT:  .LBB6_3:
@@ -3887,38 +3884,37 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[36:37], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1132-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1132-NEXT:    s_mov_b32 s12, s33
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
 ; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v3, s37
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_or_b32 s38, s0, s38
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s38
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1132-NEXT:  .LBB6_3:
@@ -3948,12 +3944,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX9-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -3961,29 +3957,28 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s33
-; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
+; GFX9-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s36
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX9-DPP-NEXT:  .LBB6_3:
@@ -4012,12 +4007,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -4025,30 +4020,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
+; GFX1064-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1064-DPP-NEXT:  .LBB6_3:
@@ -4076,12 +4071,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1032-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -4089,30 +4084,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1032-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_or_b32 s38, s0, s38
+; GFX1032-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s38
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1032-DPP-NEXT:  .LBB6_3:
@@ -4136,15 +4131,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[36:37], 0x0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -4152,26 +4147,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s33
-; GFX1164-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1164-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
 ; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[38:39]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1164-DPP-NEXT:  .LBB6_3:
@@ -4194,38 +4188,37 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[36:37], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
 ; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s33
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
 ; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v3, s37
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_or_b32 s38, s0, s38
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s38
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1132-DPP-NEXT:  .LBB6_3:
@@ -4311,11 +4304,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7LESS-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX7LESS-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7LESS-NEXT:    s_or_b64 s[42:43], vcc, s[42:43]
+; GFX7LESS-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT:    s_or_b64 s[42:43], s[0:1], s[42:43]
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[42:43]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4395,9 +4387,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX9-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4478,9 +4469,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_clause 0x1
 ; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX1064-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4561,9 +4552,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_clause 0x1
 ; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX1032-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1032-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_or_b32 s44, s0, s44
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4631,10 +4622,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1164-NEXT:    scratch_load_b64 v[2:3], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4699,10 +4689,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    scratch_load_b64 v[2:3], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1132-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_or_b32 s44, s0, s44
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4783,9 +4772,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX9-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4866,9 +4854,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_clause 0x1
 ; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX1064-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
+; GFX1064-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4949,9 +4937,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_clause 0x1
 ; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX1032-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1032-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_or_b32 s44, s0, s44
+; GFX1032-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5019,10 +5007,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1164-DPP-NEXT:    scratch_load_b64 v[2:3], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5087,10 +5074,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[2:3], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1132-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_or_b32 s44, s0, s44
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6017,11 +6003,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s37
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
 ; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7LESS-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX7LESS-NEXT:  .LBB10_3:
@@ -6050,12 +6035,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
@@ -6063,29 +6048,28 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[40:43], 0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX9-NEXT:    s_mov_b32 s12, s33
-; GFX9-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
+; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s36
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s37
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX9-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX9-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX9-NEXT:  .LBB10_3:
@@ -6114,12 +6098,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
@@ -6127,30 +6111,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1064-NEXT:    s_mov_b32 s12, s33
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1064-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1064-NEXT:  .LBB10_3:
@@ -6178,12 +6162,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1032-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
@@ -6191,30 +6175,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1032-NEXT:    s_mov_b32 s12, s33
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1032-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1032-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_or_b32 s38, s0, s38
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s38
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1032-NEXT:  .LBB10_3:
@@ -6238,15 +6222,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[36:37], 0x0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1164-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
@@ -6254,26 +6238,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1164-NEXT:    s_mov_b32 s12, s33
-; GFX1164-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1164-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
 ; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[38:39]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1164-NEXT:  .LBB10_3:
@@ -6296,38 +6279,37 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[36:37], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1132-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
 ; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1132-NEXT:    s_mov_b32 s12, s33
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
 ; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v3, s37
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_or_b32 s38, s0, s38
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s38
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1132-NEXT:  .LBB10_3:
@@ -6357,12 +6339,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX9-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -6370,29 +6352,28 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s33
-; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
+; GFX9-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s36
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX9-DPP-NEXT:  .LBB10_3:
@@ -6421,12 +6402,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -6434,30 +6415,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
+; GFX1064-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1064-DPP-NEXT:  .LBB10_3:
@@ -6485,12 +6466,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1032-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -6498,30 +6479,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1032-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_or_b32 s38, s0, s38
+; GFX1032-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s38
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1032-DPP-NEXT:  .LBB10_3:
@@ -6545,15 +6526,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[36:37], 0x0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -6561,26 +6542,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s33
-; GFX1164-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1164-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
 ; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[38:39]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1164-DPP-NEXT:  .LBB10_3:
@@ -6603,38 +6583,37 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[36:37], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s33
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
 ; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v3, s37
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_or_b32 s38, s0, s38
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s38
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1132-DPP-NEXT:  .LBB10_3:
@@ -6720,11 +6699,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7LESS-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX7LESS-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7LESS-NEXT:    s_or_b64 s[42:43], vcc, s[42:43]
+; GFX7LESS-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT:    s_or_b64 s[42:43], s[0:1], s[42:43]
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[42:43]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6804,9 +6782,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX9-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6887,9 +6864,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_clause 0x1
 ; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX1064-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6970,9 +6947,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_clause 0x1
 ; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX1032-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1032-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_or_b32 s44, s0, s44
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7040,10 +7017,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1164-NEXT:    scratch_load_b64 v[2:3], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7108,10 +7084,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    scratch_load_b64 v[2:3], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1132-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_or_b32 s44, s0, s44
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7192,9 +7167,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX9-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7275,9 +7249,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_clause 0x1
 ; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX1064-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
+; GFX1064-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7358,9 +7332,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    s_clause 0x1
 ; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
 ; GFX1032-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1032-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_or_b32 s44, s0, s44
+; GFX1032-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7428,10 +7402,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1164-DPP-NEXT:    scratch_load_b64 v[2:3], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7496,10 +7469,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[2:3], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1132-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_or_b32 s44, s0, s44
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index c5f7980d1e3a9..ec170bbaa2a76 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -5677,11 +5677,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s37
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
 ; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7LESS-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX7LESS-NEXT:  .LBB9_3:
@@ -5714,12 +5713,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX9-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
@@ -5727,11 +5726,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[40:43], 0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX9-NEXT:    s_mov_b32 s12, s33
-; GFX9-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
@@ -5744,11 +5743,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX9-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX9-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX9-NEXT:  .LBB9_3:
@@ -5781,12 +5779,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1064-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1064-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
@@ -5794,29 +5792,29 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1064-NEXT:    s_mov_b32 s12, s33
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1064-NEXT:  .LBB9_3:
@@ -5848,12 +5846,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1032-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1032-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
@@ -5861,29 +5859,29 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1032-NEXT:    s_mov_b32 s12, s33
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1032-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_or_b32 s38, s0, s38
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s38
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1032-NEXT:  .LBB9_3:
@@ -5912,15 +5910,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1164-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
@@ -5928,29 +5925,27 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1164-NEXT:    s_mov_b32 s12, s33
 ; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[38:39]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1164-NEXT:  .LBB9_3:
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
@@ -5974,40 +5969,38 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1132-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1132-NEXT:    s_mov_b32 s12, s33
 ; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v3, s37
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_or_b32 s38, s0, s38
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s38
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1132-NEXT:  .LBB9_3:
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
@@ -6037,12 +6030,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX9-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -6050,11 +6043,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s33
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
@@ -6067,11 +6060,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX9-DPP-NEXT:  .LBB9_3:
@@ -6104,12 +6096,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1064-DPP-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1064-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -6117,29 +6109,29 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
+; GFX1064-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1064-DPP-NEXT:  .LBB9_3:
@@ -6171,12 +6163,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1032-DPP-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1032-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -6184,29 +6176,29 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1032-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_or_b32 s38, s0, s38
+; GFX1032-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s38
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1032-DPP-NEXT:  .LBB9_3:
@@ -6235,15 +6227,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -6251,29 +6242,27 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[38:39]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1164-DPP-NEXT:  .LBB9_3:
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
@@ -6297,40 +6286,38 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v3, s37
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_or_b32 s38, s0, s38
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s38
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1132-DPP-NEXT:  .LBB9_3:
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4
   ret void
@@ -6413,11 +6400,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
 ; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
 ; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[42:43], vcc, s[42:43]
+; GFX7LESS-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT:    s_or_b64 s[42:43], s[0:1], s[42:43]
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[42:43]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6459,14 +6445,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v41, v1
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX9-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-NEXT:    v_mov_b32_e32 v41, v1
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX9-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX9-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX9-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX9-NEXT:    s_add_u32 s8, s36, 44
 ; GFX9-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
@@ -6474,11 +6460,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
 ; GFX9-NEXT:    s_mov_b32 s12, s41
 ; GFX9-NEXT:    s_mov_b32 s13, s40
@@ -6495,11 +6481,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6541,26 +6526,26 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX1064-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1064-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX1064-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX1064-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1064-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s42
+; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
@@ -6571,18 +6556,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_mov_b32 s13, s40
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6624,26 +6609,26 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1032-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX1032-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1032-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX1032-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1032-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s42
+; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
@@ -6654,18 +6639,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_mov_b32 s13, s40
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1032-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_or_b32 s44, s0, s44
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6697,17 +6682,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1164-NEXT:    global_load_b64 v[1:2], v43, s[42:43]
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1164-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1164-NEXT:    global_load_b64 v[0:1], v43, s[42:43]
 ; GFX1164-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1164-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
@@ -6715,7 +6699,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
@@ -6725,19 +6709,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_mov_b32 s13, s40
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6770,20 +6753,20 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
-; GFX1132-NEXT:    global_load_b64 v[1:2], v43, s[42:43]
+; GFX1132-NEXT:    global_load_b64 v[0:1], v43, s[42:43]
 ; GFX1132-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1132-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
@@ -6793,17 +6776,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_mov_b32 s13, s40
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_or_b32 s44, s0, s44
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6846,14 +6829,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, v1
+; GFX9-DPP-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX9-DPP-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX9-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -6861,11 +6844,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s41
 ; GFX9-DPP-NEXT:    s_mov_b32 s13, s40
@@ -6882,11 +6865,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6928,26 +6910,26 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1064-DPP-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX1064-DPP-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1064-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
@@ -6958,18 +6940,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_mov_b32 s13, s40
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
+; GFX1064-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7011,26 +6993,26 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1032-DPP-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1032-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
@@ -7041,18 +7023,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, s40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1032-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_or_b32 s44, s0, s44
+; GFX1032-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7084,17 +7066,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v43, s[42:43]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1164-DPP-NEXT:    global_load_b64 v[0:1], v43, s[42:43]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -7102,7 +7083,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
@@ -7112,19 +7093,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s40
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7157,20 +7137,20 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
-; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v43, s[42:43]
+; GFX1132-DPP-NEXT:    global_load_b64 v[0:1], v43, s[42:43]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
@@ -7180,17 +7160,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_or_b32 s44, s0, s44
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -9634,11 +9614,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s37
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
 ; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7LESS-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX7LESS-NEXT:  .LBB16_3:
@@ -9674,12 +9653,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX9-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX9-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
@@ -9687,11 +9666,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[40:43], 0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX9-NEXT:    s_mov_b32 s12, s33
-; GFX9-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
@@ -9704,11 +9683,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX9-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX9-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX9-NEXT:  .LBB16_3:
@@ -9742,12 +9720,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1064-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1064-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
@@ -9755,29 +9733,29 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1064-NEXT:    s_mov_b32 s12, s33
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1064-NEXT:  .LBB16_3:
@@ -9810,12 +9788,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1032-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1032-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
@@ -9823,29 +9801,29 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1032-NEXT:    s_mov_b32 s12, s33
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1032-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_or_b32 s38, s0, s38
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s38
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1032-NEXT:  .LBB16_3:
@@ -9880,15 +9858,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1164-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
@@ -9896,29 +9873,27 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1164-NEXT:    s_mov_b32 s12, s33
 ; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[38:39]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1164-NEXT:  .LBB16_3:
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
@@ -9948,40 +9923,38 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1132-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1132-NEXT:    s_mov_b32 s12, s33
 ; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v3, s37
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_or_b32 s38, s0, s38
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s38
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1132-NEXT:  .LBB16_3:
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
@@ -10014,12 +9987,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX9-DPP-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DPP-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX9-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -10027,11 +10000,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s33
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
@@ -10044,11 +10017,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX9-DPP-NEXT:  .LBB16_3:
@@ -10082,12 +10054,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1064-DPP-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-DPP-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1064-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -10095,29 +10067,29 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
+; GFX1064-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[38:39]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1064-DPP-NEXT:  .LBB16_3:
@@ -10150,12 +10122,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[36:37], 0x0
 ; GFX1032-DPP-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-DPP-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1032-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -10163,29 +10135,29 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1032-DPP-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_or_b32 s38, s0, s38
+; GFX1032-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s38
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1032-DPP-NEXT:  .LBB16_3:
@@ -10220,15 +10192,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -10236,29 +10207,27 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s37
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_or_b64 s[38:39], s[0:1], s[38:39]
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[38:39]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1164-DPP-NEXT:  .LBB16_3:
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
@@ -10288,40 +10257,38 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mul_f64 v[41:42], 4.0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s33
 ; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v3, s37
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s38, vcc_lo, s38
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_or_b32 s38, s0, s38
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s38
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1132-DPP-NEXT:  .LBB16_3:
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 monotonic, align 4
   ret void
@@ -10404,11 +10371,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
 ; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
 ; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[42:43], vcc, s[42:43]
+; GFX7LESS-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT:    s_or_b64 s[42:43], s[0:1], s[42:43]
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[42:43]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10450,14 +10416,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX9-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v41, v1
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX9-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-NEXT:    v_mov_b32_e32 v41, v1
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX9-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX9-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX9-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX9-NEXT:    s_add_u32 s8, s36, 44
 ; GFX9-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
@@ -10465,11 +10431,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
 ; GFX9-NEXT:    s_mov_b32 s12, s41
 ; GFX9-NEXT:    s_mov_b32 s13, s40
@@ -10486,11 +10452,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10532,26 +10497,26 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX1064-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1064-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX1064-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX1064-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1064-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s42
+; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
@@ -10562,18 +10527,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_mov_b32 s13, s40
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10615,26 +10580,26 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1032-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX1032-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1032-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX1032-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1032-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s42
+; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
@@ -10645,18 +10610,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_mov_b32 s13, s40
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1032-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_or_b32 s44, s0, s44
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10688,17 +10653,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1164-NEXT:    global_load_b64 v[1:2], v43, s[42:43]
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1164-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1164-NEXT:    global_load_b64 v[0:1], v43, s[42:43]
 ; GFX1164-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1164-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
@@ -10706,7 +10670,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
@@ -10716,19 +10680,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_mov_b32 s13, s40
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10761,20 +10724,20 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
-; GFX1132-NEXT:    global_load_b64 v[1:2], v43, s[42:43]
+; GFX1132-NEXT:    global_load_b64 v[0:1], v43, s[42:43]
 ; GFX1132-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1132-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
@@ -10784,17 +10747,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    s_mov_b32 s13, s40
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_or_b32 s44, s0, s44
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10837,14 +10800,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, v1
+; GFX9-DPP-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX9-DPP-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX9-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -10852,11 +10815,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s41
 ; GFX9-DPP-NEXT:    s_mov_b32 s13, s40
@@ -10873,11 +10836,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX9-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10919,26 +10881,26 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1064-DPP-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX1064-DPP-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1064-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
@@ -10949,18 +10911,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_mov_b32 s13, s40
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
+; GFX1064-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11002,26 +10964,26 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v43, s[42:43]
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1032-DPP-NEXT:    global_load_dwordx2 v[0:1], v43, s[42:43]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1032-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
@@ -11032,18 +10994,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, s40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1032-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_or_b32 s44, s0, s44
+; GFX1032-DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11075,17 +11037,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v43, s[42:43]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v1
+; GFX1164-DPP-NEXT:    global_load_b64 v[0:1], v43, s[42:43]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[44:45], 0
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -11093,7 +11054,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
@@ -11103,19 +11064,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s40
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1164-DPP-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], s[0:1], s[44:45]
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11148,20 +11108,20 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
-; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v43, s[42:43]
+; GFX1132-DPP-NEXT:    global_load_b64 v[0:1], v43, s[42:43]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
 ; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v42
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
@@ -11171,17 +11131,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[0:1], off
+; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[2:3], off offset:8
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX1132-DPP-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_or_b32 s44, s0, s44
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
index 13372dd94619b..75ea61cbe268e 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -466,7 +466,6 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
 ; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-SDAG-NEXT:  .LBB4_2: ; %ret
 ; GFX8-SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -489,7 +488,6 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-GISEL-NEXT:  .LBB4_2: ; %ret
 ; GFX8-GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -509,7 +507,6 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:  .LBB4_2: ; %ret
 ; GFX9-SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -529,7 +526,6 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:  .LBB4_2: ; %ret
 ; GFX9-GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -550,7 +546,6 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:  .LBB4_3: ; %ret
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v2
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -573,7 +568,6 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:  .LBB4_3: ; %ret
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v2
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]

>From 6aaa564109f9bf740a43d74f240ab5d973be41a1 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 27 May 2024 19:33:20 -0500
Subject: [PATCH 22/25] Overload the function
 CallLowering::determineAndHandleAssignments() with a CCState argument.

---
 .../llvm/CodeGen/GlobalISel/CallLowering.h    |  6 ++
 llvm/lib/CodeGen/GlobalISel/CallLowering.cpp  | 13 +++++
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 55 +++++++------------
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h   |  7 ---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  4 +-
 5 files changed, 41 insertions(+), 44 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
index 4c187a3068d82..148b187a1ee55 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
@@ -407,6 +407,12 @@ class CallLowering {
       CallingConv::ID CallConv, bool IsVarArg,
       ArrayRef<Register> ThisReturnRegs = std::nullopt) const;
 
+  bool determineAndHandleAssignments(
+      ValueHandler &Handler, ValueAssigner &Assigner,
+      SmallVectorImpl<ArgInfo> &Args, MachineIRBuilder &MIRBuilder,
+      CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
+      ArrayRef<Register> ThisReturnRegs = std::nullopt) const;
+
   /// Use \p Handler to insert code to handle the argument/return values
   /// represented by \p Args. It's expected determineAssignments previously
   /// processed these arguments to populate \p CCState and \p ArgLocs.
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 363fad53b76c3..e363436ac7739 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -628,6 +628,19 @@ bool CallLowering::determineAndHandleAssignments(
                            ThisReturnRegs);
 }
 
+bool CallLowering::determineAndHandleAssignments(
+    ValueHandler &Handler, ValueAssigner &Assigner,
+    SmallVectorImpl<ArgInfo> &Args, MachineIRBuilder &MIRBuilder,
+    CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
+    ArrayRef<Register> ThisReturnRegs) const {
+
+  if (!determineAssignments(Assigner, Args, CCInfo))
+    return false;
+
+  return handleAssignments(Handler, Args, CCInfo, ArgLocs, MIRBuilder,
+                           ThisReturnRegs);
+}
+
 static unsigned extendOpFromFlags(llvm::ISD::ArgFlagsTy Flags) {
   if (Flags.isSExt())
     return TargetOpcode::G_SEXT;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 9217370193156..65cdca5b6e1e7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -317,35 +317,6 @@ bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
   return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv, IsVarArg));
 }
 
-/// Replace CallLowering::determineAndHandleAssignments() because we need to
-/// reserve ScratchRSrcReg when necessary.
-/// TODO: Investigate if reserving ScratchRSrcReg can be moved to calling conv
-/// functions. If so, then this function is not needed anymore -- we can just
-/// use CallLowering::determineAndHandleAssignments() as before.
-bool AMDGPUCallLowering::determineAndHandleAssignmentsLocal(
-    ValueHandler &Handler, ValueAssigner &Assigner,
-    SmallVectorImpl<ArgInfo> &Args, MachineIRBuilder &MIRBuilder,
-    CallingConv::ID CallConv, bool IsVarArg) const {
-
-  MachineFunction &MF = MIRBuilder.getMF();
-  const Function &F = MF.getFunction();
-
-  SmallVector<CCValAssign, 16> ArgLocs;
-
-  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, F.getContext());
-
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  if (!ST.enableFlatScratch()) {
-    SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-    CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
-  }
-
-  if (!determineAssignments(Assigner, Args, CCInfo))
-    return false;
-
-  return handleAssignments(Handler, Args, CCInfo, ArgLocs, MIRBuilder);
-}
-
 /// Lower the return value for the already existing \p Ret. This assumes that
 /// \p B's insertion point is correct.
 bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
@@ -409,8 +380,16 @@ bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
   OutgoingValueAssigner Assigner(AssignFn);
   AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
 
-  return determineAndHandleAssignmentsLocal(RetHandler, Assigner, SplitRetInfos,
-                                            B, CC, F.isVarArg());
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.enableFlatScratch()) {
+    SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+    CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
+  }
+  return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
+                                       CCInfo, ArgLocs);
 }
 
 bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
@@ -1575,9 +1554,17 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                                       Info.IsVarArg);
     IncomingValueAssigner Assigner(RetAssignFn);
     CallReturnHandler Handler(MIRBuilder, MRI, MIB);
-    if (!determineAndHandleAssignmentsLocal(Handler, Assigner, InArgs,
-                                            MIRBuilder, Info.CallConv,
-                                            Info.IsVarArg))
+
+    SmallVector<CCValAssign, 16> ArgLocs;
+    CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
+
+    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+    if (!ST.enableFlatScratch()) {
+      SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+      CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
+    }
+    if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
+                                       CCInfo, ArgLocs))
       return false;
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index f9b8599e8ce8c..a6e801f2a547b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -37,13 +37,6 @@ class AMDGPUCallLowering final : public CallLowering {
   bool lowerReturnVal(MachineIRBuilder &B, const Value *Val,
                       ArrayRef<Register> VRegs, MachineInstrBuilder &Ret) const;
 
-  bool determineAndHandleAssignmentsLocal(ValueHandler &Handler,
-                                          ValueAssigner &Assigner,
-                                          SmallVectorImpl<ArgInfo> &Args,
-                                          MachineIRBuilder &MIRBuilder,
-                                          CallingConv::ID CallConv,
-                                          bool IsVarArg) const;
-
 public:
   AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d98045f422878..543a6bebc45ba 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16039,9 +16039,7 @@ static bool isCopyFromRegForI1Return(const SDNode *N) {
     N3 = N3->getOperand(0).getNode();
   } while (N3->getOpcode() == ISD::CopyFromReg);
 
-  if (N3->getOpcode() != ISD::CALLSEQ_END)
-    return false;
-  return true;
+  return N3->getOpcode() == ISD::CALLSEQ_END;
 }
 
 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,

>From 4892dc721357eb7ec110767cef3389e60ca55a62 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 27 May 2024 22:30:28 -0500
Subject: [PATCH 23/25] For i1 arg, set reg bank in RegBankSelect, instead of
 setting reg class in IRTranslator.

---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |   4 -
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   7 +
 .../GlobalISel/function-call-i1-return.ll     |  20 +-
 .../AMDGPU/GlobalISel/function-i1-args.ll     | 224 +++++++++---------
 .../irtranslator-call-return-values.ll        |   6 +-
 .../GlobalISel/irtranslator-function-args.ll  |  12 +-
 .../GlobalISel/irtranslator-invariant.ll      |   2 +-
 7 files changed, 139 insertions(+), 136 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 65cdca5b6e1e7..6d4b81df13a05 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -129,10 +129,6 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
 
     if (VA.getLocVT() == MVT::i1) {
       MIRBuilder.buildCopy(ValVReg, PhysReg);
-      MRI.setRegClass(ValVReg, MIRBuilder.getMF()
-                                   .getSubtarget<GCNSubtarget>()
-                                   .getRegisterInfo()
-                                   ->getBoolRC());
       return;
     }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index aa44cca11f800..7bf98c08b0e4c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3741,6 +3741,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     if (!DstBank)
       DstBank = SrcBank;
 
+    // For i1 function arguments, the call of getRegBank() currently gives
+    // incorrect result. We set both src and dst banks to VCCRegBank.
+    if (!MI.getOperand(1).getReg().isVirtual() &&
+        MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(1)) {
+      DstBank = SrcBank = &AMDGPU::VCCRegBank;
+    }
+
     // For i1 return value, the dst reg is an SReg but we need to set the reg
     // bank to VCCRegBank.
     if (!MI.getOperand(0).getReg().isVirtual() &&
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
index 679eb28d4a04c..244e9a4b5c56c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
@@ -28,7 +28,7 @@ define void @test_call_i1_func_void() {
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
 ; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr4_sgpr5
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr4_sgpr5
 ; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
@@ -39,7 +39,7 @@ define void @test_call_i1_func_void() {
 ; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @i1_func_void
 ; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @i1_func_void, csr_amdgpu, implicit-def $sgpr0
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr0
 ; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX11-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    SI_RETURN
@@ -75,7 +75,7 @@ define void @test_call_zeroext_i1_func_void() {
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
 ; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @zeroext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr4_sgpr5
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr4_sgpr5
 ; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
@@ -86,7 +86,7 @@ define void @test_call_zeroext_i1_func_void() {
 ; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @zeroext_i1_func_void
 ; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @zeroext_i1_func_void, csr_amdgpu, implicit-def $sgpr0
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr0
 ; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX11-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    SI_RETURN
@@ -122,7 +122,7 @@ define void @test_call_signext_i1_func_void() {
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
 ; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @signext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr4_sgpr5
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr4_sgpr5
 ; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
@@ -133,7 +133,7 @@ define void @test_call_signext_i1_func_void() {
 ; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @signext_i1_func_void
 ; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @signext_i1_func_void, csr_amdgpu, implicit-def $sgpr0
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr0
 ; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX11-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    SI_RETURN
@@ -334,8 +334,8 @@ define void @test_call_a2i1_func_void() {
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX9-NEXT:    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
 ; GFX9-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @a2i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr4_sgpr5, implicit-def $sgpr6_sgpr7
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
-; GFX9-NEXT:    [[COPY3:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY3:%[0-9]+]]:_(s1) = COPY $sgpr6_sgpr7
 ; GFX9-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
@@ -349,8 +349,8 @@ define void @test_call_a2i1_func_void() {
 ; GFX11-NEXT:    ADJCALLSTACKUP 0, 0, implicit-def $scc
 ; GFX11-NEXT:    [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a2i1_func_void
 ; GFX11-NEXT:    $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @a2i1_func_void, csr_amdgpu, implicit-def $sgpr0, implicit-def $sgpr1
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
-; GFX11-NEXT:    [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr0
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr1
 ; GFX11-NEXT:    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
 ; GFX11-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
index 7739a705fff1c..89dd168c5270b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
@@ -6,7 +6,7 @@ define void @void_func_i1(i1 %arg0) {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
 ; GFX9-NEXT: {{  $}}
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr4_sgpr5
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
 ; GFX9-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    SI_RETURN
@@ -15,7 +15,7 @@ define void @void_func_i1(i1 %arg0) {
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT:   liveins: $sgpr0
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr0
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
 ; GFX11-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    SI_RETURN
@@ -57,7 +57,7 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    liveins: $sgpr4_sgpr5
 ; GFX9-NEXT: {{  $}}
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr4_sgpr5
 ; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
 ; GFX9-NEXT:    [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s1)
@@ -69,7 +69,7 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) {
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT:    liveins: $sgpr0
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr0
 ; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
 ; GFX11-NEXT:    [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s1)
@@ -116,7 +116,7 @@ define void @void_func_i1_signext(i1 signext %arg0) {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    liveins: $sgpr4_sgpr5
 ; GFX9-NEXT: {{  $}}
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr4_sgpr5
 ; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
 ; GFX9-NEXT:    [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s1)
@@ -128,7 +128,7 @@ define void @void_func_i1_signext(i1 signext %arg0) {
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT:    liveins: $sgpr0
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr0
 ; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
 ; GFX11-NEXT:    [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s1)
@@ -175,8 +175,8 @@ define void @void_func_a2i1([2 x i1] %arg0) {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
 ; GFX9-NEXT: {{  $}}
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr6_sgpr7
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
 ; GFX9-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
@@ -188,8 +188,8 @@ define void @void_func_a2i1([2 x i1] %arg0) {
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT:    liveins: $sgpr0, $sgpr1
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
-; GFX11-NEXT:    [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr0
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr1
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
 ; GFX11-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
@@ -318,8 +318,8 @@ define void @void_func_i1_i1(i1 %arg0, i1 %arg1) {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
 ; GFX9-NEXT: {{  $}}
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr6_sgpr7
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
 ; GFX9-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
@@ -329,8 +329,8 @@ define void @void_func_i1_i1(i1 %arg0, i1 %arg1) {
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT:    liveins: $sgpr0, $sgpr1
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
-; GFX11-NEXT:    [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr0
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr1
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
 ; GFX11-NEXT:    G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
@@ -382,19 +382,19 @@ define void @exhaust_sgprs_by_i1_args(
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
 ; GFX9-NEXT: {{  $}}
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
-; GFX9-NEXT:    [[COPY1:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr8_sgpr9
-; GFX9-NEXT:    [[COPY3:%[0-9]+]]:sreg_64(s1) = COPY $sgpr10_sgpr11
-; GFX9-NEXT:    [[COPY4:%[0-9]+]]:sreg_64(s1) = COPY $sgpr12_sgpr13
-; GFX9-NEXT:    [[COPY5:%[0-9]+]]:sreg_64(s1) = COPY $sgpr14_sgpr15
-; GFX9-NEXT:    [[COPY6:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
-; GFX9-NEXT:    [[COPY7:%[0-9]+]]:sreg_64(s1) = COPY $sgpr18_sgpr19
-; GFX9-NEXT:    [[COPY8:%[0-9]+]]:sreg_64(s1) = COPY $sgpr20_sgpr21
-; GFX9-NEXT:    [[COPY9:%[0-9]+]]:sreg_64(s1) = COPY $sgpr22_sgpr23
-; GFX9-NEXT:    [[COPY10:%[0-9]+]]:sreg_64(s1) = COPY $sgpr24_sgpr25
-; GFX9-NEXT:    [[COPY11:%[0-9]+]]:sreg_64(s1) = COPY $sgpr26_sgpr27
-; GFX9-NEXT:    [[COPY12:%[0-9]+]]:sreg_64(s1) = COPY $sgpr28_sgpr29
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY1:%[0-9]+]]:_(s1) = COPY $sgpr6_sgpr7
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr8_sgpr9
+; GFX9-NEXT:    [[COPY3:%[0-9]+]]:_(s1) = COPY $sgpr10_sgpr11
+; GFX9-NEXT:    [[COPY4:%[0-9]+]]:_(s1) = COPY $sgpr12_sgpr13
+; GFX9-NEXT:    [[COPY5:%[0-9]+]]:_(s1) = COPY $sgpr14_sgpr15
+; GFX9-NEXT:    [[COPY6:%[0-9]+]]:_(s1) = COPY $sgpr16_sgpr17
+; GFX9-NEXT:    [[COPY7:%[0-9]+]]:_(s1) = COPY $sgpr18_sgpr19
+; GFX9-NEXT:    [[COPY8:%[0-9]+]]:_(s1) = COPY $sgpr20_sgpr21
+; GFX9-NEXT:    [[COPY9:%[0-9]+]]:_(s1) = COPY $sgpr22_sgpr23
+; GFX9-NEXT:    [[COPY10:%[0-9]+]]:_(s1) = COPY $sgpr24_sgpr25
+; GFX9-NEXT:    [[COPY11:%[0-9]+]]:_(s1) = COPY $sgpr26_sgpr27
+; GFX9-NEXT:    [[COPY12:%[0-9]+]]:_(s1) = COPY $sgpr28_sgpr29
 ; GFX9-NEXT:    [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT:    [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT:    [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -446,36 +446,36 @@ define void @exhaust_sgprs_by_i1_args(
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
-; GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
-; GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr2
-; GFX11-NEXT:   [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY $sgpr3
-; GFX11-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY $sgpr4
-; GFX11-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY $sgpr5
-; GFX11-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY $sgpr6
-; GFX11-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY $sgpr7
-; GFX11-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY $sgpr8
-; GFX11-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY $sgpr9
-; GFX11-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY $sgpr10
-; GFX11-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY $sgpr11
-; GFX11-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY $sgpr12
-; GFX11-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY $sgpr13
-; GFX11-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY $sgpr14
-; GFX11-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY $sgpr15
-; GFX11-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY $sgpr16
-; GFX11-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY $sgpr17
-; GFX11-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY $sgpr18
-; GFX11-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY $sgpr19
-; GFX11-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY $sgpr20
-; GFX11-NEXT:   [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY $sgpr21
-; GFX11-NEXT:   [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY $sgpr22
-; GFX11-NEXT:   [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY $sgpr23
-; GFX11-NEXT:   [[COPY24:%[0-9]+]]:sreg_32(s1) = COPY $sgpr24
-; GFX11-NEXT:   [[COPY25:%[0-9]+]]:sreg_32(s1) = COPY $sgpr25
-; GFX11-NEXT:   [[COPY26:%[0-9]+]]:sreg_32(s1) = COPY $sgpr26
-; GFX11-NEXT:   [[COPY27:%[0-9]+]]:sreg_32(s1) = COPY $sgpr27
-; GFX11-NEXT:   [[COPY28:%[0-9]+]]:sreg_32(s1) = COPY $sgpr28
-; GFX11-NEXT:   [[COPY29:%[0-9]+]]:sreg_32(s1) = COPY $sgpr29
+; GFX11-NEXT:   [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr0
+; GFX11-NEXT:   [[COPY1:%[0-9]+]]:_(s1) = COPY $sgpr1
+; GFX11-NEXT:   [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr2
+; GFX11-NEXT:   [[COPY3:%[0-9]+]]:_(s1) = COPY $sgpr3
+; GFX11-NEXT:   [[COPY4:%[0-9]+]]:_(s1) = COPY $sgpr4
+; GFX11-NEXT:   [[COPY5:%[0-9]+]]:_(s1) = COPY $sgpr5
+; GFX11-NEXT:   [[COPY6:%[0-9]+]]:_(s1) = COPY $sgpr6
+; GFX11-NEXT:   [[COPY7:%[0-9]+]]:_(s1) = COPY $sgpr7
+; GFX11-NEXT:   [[COPY8:%[0-9]+]]:_(s1) = COPY $sgpr8
+; GFX11-NEXT:   [[COPY9:%[0-9]+]]:_(s1) = COPY $sgpr9
+; GFX11-NEXT:   [[COPY10:%[0-9]+]]:_(s1) = COPY $sgpr10
+; GFX11-NEXT:   [[COPY11:%[0-9]+]]:_(s1) = COPY $sgpr11
+; GFX11-NEXT:   [[COPY12:%[0-9]+]]:_(s1) = COPY $sgpr12
+; GFX11-NEXT:   [[COPY13:%[0-9]+]]:_(s1) = COPY $sgpr13
+; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s1) = COPY $sgpr14
+; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s1) = COPY $sgpr15
+; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s1) = COPY $sgpr16
+; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s1) = COPY $sgpr17
+; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s1) = COPY $sgpr18
+; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s1) = COPY $sgpr19
+; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s1) = COPY $sgpr20
+; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s1) = COPY $sgpr21
+; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s1) = COPY $sgpr22
+; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s1) = COPY $sgpr23
+; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s1) = COPY $sgpr24
+; GFX11-NEXT:   [[COPY25:%[0-9]+]]:_(s1) = COPY $sgpr25
+; GFX11-NEXT:   [[COPY26:%[0-9]+]]:_(s1) = COPY $sgpr26
+; GFX11-NEXT:   [[COPY27:%[0-9]+]]:_(s1) = COPY $sgpr27
+; GFX11-NEXT:   [[COPY28:%[0-9]+]]:_(s1) = COPY $sgpr28
+; GFX11-NEXT:   [[COPY29:%[0-9]+]]:_(s1) = COPY $sgpr29
 ; GFX11-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX11-NEXT:   [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
 ; GFX11-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -532,19 +532,19 @@ define void @void_func_a48i1([48 x i1] %arg0) {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
 ; GFX9-NEXT: {{  $}}
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
-; GFX9-NEXT:    [[COPY1:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr8_sgpr9
-; GFX9-NEXT:    [[COPY3:%[0-9]+]]:sreg_64(s1) = COPY $sgpr10_sgpr11
-; GFX9-NEXT:    [[COPY4:%[0-9]+]]:sreg_64(s1) = COPY $sgpr12_sgpr13
-; GFX9-NEXT:    [[COPY5:%[0-9]+]]:sreg_64(s1) = COPY $sgpr14_sgpr15
-; GFX9-NEXT:    [[COPY6:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
-; GFX9-NEXT:    [[COPY7:%[0-9]+]]:sreg_64(s1) = COPY $sgpr18_sgpr19
-; GFX9-NEXT:    [[COPY8:%[0-9]+]]:sreg_64(s1) = COPY $sgpr20_sgpr21
-; GFX9-NEXT:    [[COPY9:%[0-9]+]]:sreg_64(s1) = COPY $sgpr22_sgpr23
-; GFX9-NEXT:    [[COPY10:%[0-9]+]]:sreg_64(s1) = COPY $sgpr24_sgpr25
-; GFX9-NEXT:    [[COPY11:%[0-9]+]]:sreg_64(s1) = COPY $sgpr26_sgpr27
-; GFX9-NEXT:    [[COPY12:%[0-9]+]]:sreg_64(s1) = COPY $sgpr28_sgpr29
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY1:%[0-9]+]]:_(s1) = COPY $sgpr6_sgpr7
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr8_sgpr9
+; GFX9-NEXT:    [[COPY3:%[0-9]+]]:_(s1) = COPY $sgpr10_sgpr11
+; GFX9-NEXT:    [[COPY4:%[0-9]+]]:_(s1) = COPY $sgpr12_sgpr13
+; GFX9-NEXT:    [[COPY5:%[0-9]+]]:_(s1) = COPY $sgpr14_sgpr15
+; GFX9-NEXT:    [[COPY6:%[0-9]+]]:_(s1) = COPY $sgpr16_sgpr17
+; GFX9-NEXT:    [[COPY7:%[0-9]+]]:_(s1) = COPY $sgpr18_sgpr19
+; GFX9-NEXT:    [[COPY8:%[0-9]+]]:_(s1) = COPY $sgpr20_sgpr21
+; GFX9-NEXT:    [[COPY9:%[0-9]+]]:_(s1) = COPY $sgpr22_sgpr23
+; GFX9-NEXT:    [[COPY10:%[0-9]+]]:_(s1) = COPY $sgpr24_sgpr25
+; GFX9-NEXT:    [[COPY11:%[0-9]+]]:_(s1) = COPY $sgpr26_sgpr27
+; GFX9-NEXT:    [[COPY12:%[0-9]+]]:_(s1) = COPY $sgpr28_sgpr29
 ; GFX9-NEXT:    [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT:    [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT:    [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -640,36 +640,36 @@ define void @void_func_a64i1([64 x i1] %arg0) {
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
-; GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
-; GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr2
-; GFX11-NEXT:   [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY $sgpr3
-; GFX11-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY $sgpr4
-; GFX11-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY $sgpr5
-; GFX11-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY $sgpr6
-; GFX11-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY $sgpr7
-; GFX11-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY $sgpr8
-; GFX11-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY $sgpr9
-; GFX11-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY $sgpr10
-; GFX11-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY $sgpr11
-; GFX11-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY $sgpr12
-; GFX11-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY $sgpr13
-; GFX11-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY $sgpr14
-; GFX11-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY $sgpr15
-; GFX11-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY $sgpr16
-; GFX11-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY $sgpr17
-; GFX11-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY $sgpr18
-; GFX11-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY $sgpr19
-; GFX11-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY $sgpr20
-; GFX11-NEXT:   [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY $sgpr21
-; GFX11-NEXT:   [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY $sgpr22
-; GFX11-NEXT:   [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY $sgpr23
-; GFX11-NEXT:   [[COPY24:%[0-9]+]]:sreg_32(s1) = COPY $sgpr24
-; GFX11-NEXT:   [[COPY25:%[0-9]+]]:sreg_32(s1) = COPY $sgpr25
-; GFX11-NEXT:   [[COPY26:%[0-9]+]]:sreg_32(s1) = COPY $sgpr26
-; GFX11-NEXT:   [[COPY27:%[0-9]+]]:sreg_32(s1) = COPY $sgpr27
-; GFX11-NEXT:   [[COPY28:%[0-9]+]]:sreg_32(s1) = COPY $sgpr28
-; GFX11-NEXT:   [[COPY29:%[0-9]+]]:sreg_32(s1) = COPY $sgpr29
+; GFX11-NEXT:   [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr0
+; GFX11-NEXT:   [[COPY1:%[0-9]+]]:_(s1) = COPY $sgpr1
+; GFX11-NEXT:   [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr2
+; GFX11-NEXT:   [[COPY3:%[0-9]+]]:_(s1) = COPY $sgpr3
+; GFX11-NEXT:   [[COPY4:%[0-9]+]]:_(s1) = COPY $sgpr4
+; GFX11-NEXT:   [[COPY5:%[0-9]+]]:_(s1) = COPY $sgpr5
+; GFX11-NEXT:   [[COPY6:%[0-9]+]]:_(s1) = COPY $sgpr6
+; GFX11-NEXT:   [[COPY7:%[0-9]+]]:_(s1) = COPY $sgpr7
+; GFX11-NEXT:   [[COPY8:%[0-9]+]]:_(s1) = COPY $sgpr8
+; GFX11-NEXT:   [[COPY9:%[0-9]+]]:_(s1) = COPY $sgpr9
+; GFX11-NEXT:   [[COPY10:%[0-9]+]]:_(s1) = COPY $sgpr10
+; GFX11-NEXT:   [[COPY11:%[0-9]+]]:_(s1) = COPY $sgpr11
+; GFX11-NEXT:   [[COPY12:%[0-9]+]]:_(s1) = COPY $sgpr12
+; GFX11-NEXT:   [[COPY13:%[0-9]+]]:_(s1) = COPY $sgpr13
+; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s1) = COPY $sgpr14
+; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s1) = COPY $sgpr15
+; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s1) = COPY $sgpr16
+; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s1) = COPY $sgpr17
+; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s1) = COPY $sgpr18
+; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s1) = COPY $sgpr19
+; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s1) = COPY $sgpr20
+; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s1) = COPY $sgpr21
+; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s1) = COPY $sgpr22
+; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s1) = COPY $sgpr23
+; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s1) = COPY $sgpr24
+; GFX11-NEXT:   [[COPY25:%[0-9]+]]:_(s1) = COPY $sgpr25
+; GFX11-NEXT:   [[COPY26:%[0-9]+]]:_(s1) = COPY $sgpr26
+; GFX11-NEXT:   [[COPY27:%[0-9]+]]:_(s1) = COPY $sgpr27
+; GFX11-NEXT:   [[COPY28:%[0-9]+]]:_(s1) = COPY $sgpr28
+; GFX11-NEXT:   [[COPY29:%[0-9]+]]:_(s1) = COPY $sgpr29
 ; GFX11-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX11-NEXT:   [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
 ; GFX11-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -763,7 +763,7 @@ define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    liveins: $sgpr6, $sgpr4_sgpr5
 ; GFX9-NEXT: {{  $}}
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr4_sgpr5
 ; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr6
 ; GFX9-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
@@ -775,7 +775,7 @@ define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT:    liveins: $sgpr0, $sgpr1
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr0
 ; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
 ; GFX11-NEXT:    [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
@@ -794,7 +794,7 @@ define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
 ; GFX9-NEXT: {{  $}}
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr4
 ; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr6_sgpr7
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
 ; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
@@ -806,7 +806,7 @@ define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
 ; GFX11-NEXT: {{  $}}
 ; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
 ; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT:    [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr1
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
 ; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
@@ -821,7 +821,7 @@ define void @void_func_zeroext_i1_i1_inreg(i1 zeroext %arg0, i1 inreg %arg1) {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    liveins: $sgpr6, $sgpr4_sgpr5
 ; GFX9-NEXT: {{  $}}
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr4_sgpr5
 ; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr6
 ; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
@@ -833,7 +833,7 @@ define void @void_func_zeroext_i1_i1_inreg(i1 zeroext %arg0, i1 inreg %arg1) {
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT:    liveins: $sgpr0, $sgpr1
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr0
 ; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
 ; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
@@ -852,7 +852,7 @@ define void @void_func_i1_inreg_zeroext_i1(i1 inreg %arg0, i1 zeroext %arg1) {
 ; GFX9-NEXT: {{  $}}
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr4
 ; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr6_sgpr7
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
 ; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
@@ -864,7 +864,7 @@ define void @void_func_i1_inreg_zeroext_i1(i1 inreg %arg0, i1 zeroext %arg1) {
 ; GFX11-NEXT: {{  $}}
 ; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
 ; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT:    [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr1
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
 ; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
@@ -879,7 +879,7 @@ define void @void_func_signext_i1_i1_inreg(i1 signext %arg0, i1 inreg %arg1) {
 ; GFX9: bb.1 (%ir-block.0):
 ; GFX9-NEXT:    liveins: $sgpr6, $sgpr4_sgpr5
 ; GFX9-NEXT: {{  $}}
-; GFX9-NEXT:    [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr4_sgpr5
 ; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr6
 ; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
@@ -891,7 +891,7 @@ define void @void_func_signext_i1_i1_inreg(i1 signext %arg0, i1 inreg %arg1) {
 ; GFX11: bb.1 (%ir-block.0):
 ; GFX11-NEXT:    liveins: $sgpr0, $sgpr1
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:    [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr0
 ; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
 ; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
@@ -910,7 +910,7 @@ define void @void_func_i1_inreg_signext_i1(i1 inreg %arg0, i1 signext %arg1) {
 ; GFX9-NEXT: {{  $}}
 ; GFX9-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr4
 ; GFX9-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX9-NEXT:    [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
+; GFX9-NEXT:    [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr6_sgpr7
 ; GFX9-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
 ; GFX9-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX9-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
@@ -922,7 +922,7 @@ define void @void_func_i1_inreg_signext_i1(i1 inreg %arg0, i1 signext %arg1) {
 ; GFX11-NEXT: {{  $}}
 ; GFX11-NEXT:    [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
 ; GFX11-NEXT:    [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT:    [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
+; GFX11-NEXT:    [[COPY2:%[0-9]+]]:_(s1) = COPY $sgpr1
 ; GFX11-NEXT:    [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF  
 ; GFX11-NEXT:    G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
 ; GFX11-NEXT:    G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
index ec999149daed8..bb3090a771c60 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
@@ -199,7 +199,7 @@ define amdgpu_kernel void @test_call_external_i1_func_void() #0 {
   ; GCN-NEXT:   $sgpr15 = COPY [[DEF2]](s32)
   ; GCN-NEXT:   $vgpr31 = COPY [[OR1]](s32)
   ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
-  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s1) = COPY $sgpr0_sgpr1
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   G_STORE [[COPY21]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
@@ -275,7 +275,7 @@ define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 {
   ; GCN-NEXT:   $sgpr15 = COPY [[DEF2]](s32)
   ; GCN-NEXT:   $vgpr31 = COPY [[OR1]](s32)
   ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
-  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s1) = COPY $sgpr0_sgpr1
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY21]](s1)
   ; GCN-NEXT:   G_STORE [[ZEXT]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
@@ -334,7 +334,7 @@ define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 {
   ; GCN-NEXT:   $sgpr15 = COPY [[DEF2]](s32)
   ; GCN-NEXT:   $vgpr31 = COPY [[OR1]](s32)
   ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
-  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s1) = COPY $sgpr0_sgpr1
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY21]](s1)
   ; GCN-NEXT:   G_STORE [[SEXT]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index f0ab1b25d6f03..7ed27241742f9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -37,7 +37,7 @@ define void @void_func_i1(i1 %arg0) #0 {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $sgpr16_sgpr17
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
@@ -46,7 +46,7 @@ define void @void_func_i1(i1 %arg0) #0 {
   ; GFX11: bb.1 (%ir-block.0):
   ; GFX11-NEXT:   liveins: $sgpr0
   ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+  ; GFX11-NEXT:   [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr0
   ; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GFX11-NEXT:   G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   ; GFX11-NEXT:   SI_RETURN
@@ -59,7 +59,7 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $sgpr16_sgpr17
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s1)
@@ -77,7 +77,7 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $sgpr16_sgpr17
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s1)
@@ -96,7 +96,7 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; CHECK-NEXT:   liveins: $sgpr16_sgpr17
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -1993,7 +1993,7 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
   ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, addrspace 5)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32)
-  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(s1) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
   ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s16) from %fixed-stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
index aa6f518a3e30f..88500f90ad55d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
@@ -24,7 +24,7 @@ define i32 @load_select_const_i32_gv(i1 %cond) {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s1) = COPY $sgpr4_sgpr5
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv0
   ; CHECK-NEXT:   [[GV1:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv1
   ; CHECK-NEXT:   [[SELECT:%[0-9]+]]:_(p1) = G_SELECT [[COPY]](s1), [[GV]], [[GV1]]

>From da176a81ddbebe5675afb5367d67d45db2a27e90 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Wed, 5 Jun 2024 13:38:02 -0500
Subject: [PATCH 24/25] For i1 function return value, an additional CopyFromReg
 is created to ensure that the COPY instruction generated later on has sreg_64
 as the destination reg class instead of vreg_1. This commit fixes that
 CopyFromReg node such that it uses the physical register assigned to the
 return value instead of the new virtual register. The purpose of doing this
 is to avoid having to create the function isCopyFromRegForI1Return() that is
 used in SITargetLowering::isSDNodeSourceOfDivergence() to get around the
 assert therein.

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 25 +------
 .../CodeGen/AMDGPU/function-call-i1-return.ll | 68 +++++++++----------
 2 files changed, 36 insertions(+), 57 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 543a6bebc45ba..f2e7f649386ff 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3263,7 +3263,7 @@ SDValue SITargetLowering::LowerCallResult(
         if (TRI->isSGPRReg(MRI, VA.getLocReg())) {
           Register TmpVReg = MRI.createVirtualRegister(TRI->getBoolRC());
           SDValue TmpCopyTo = DAG.getCopyToReg(Chain, DL, TmpVReg, Val);
-          Val = DAG.getCopyFromReg(TmpCopyTo, DL, TmpVReg, MVT::i1);
+          Val = DAG.getCopyFromReg(TmpCopyTo, DL, VA.getLocReg(), MVT::i1);
         }
       }
     } else if (VA.isMemLoc()) {
@@ -16022,26 +16022,6 @@ static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
   return false;
 }
 
-LLVM_ATTRIBUTE_UNUSED
-static bool isCopyFromRegForI1Return(const SDNode *N) {
-  assert(N->getOpcode() == ISD::CopyFromReg);
-  SDNode *N1 = N->getOperand(0).getNode();
-  if (N1->getOpcode() != ISD::CopyToReg)
-    return false;
-  SDNode *N2 = N1->getOperand(0).getNode();
-  if (N2->getOpcode() != ISD::CopyFromReg)
-    return false;
-
-  // Possibly multiple CopyFromReg nodes before getting to CALLSEQ_END,
-  // e.g., when the return value is an array.
-  SDNode *N3 = N2;
-  do {
-    N3 = N3->getOperand(0).getNode();
-  } while (N3->getOpcode() == ISD::CopyFromReg);
-
-  return N3->getOpcode() == ISD::CALLSEQ_END;
-}
-
 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
                                                   FunctionLoweringInfo *FLI,
                                                   UniformityInfo *UA) const {
@@ -16059,8 +16039,7 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
     if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
       return UA->isDivergent(V);
 
-    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N) ||
-           isCopyFromRegForI1Return(N));
+    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
     return !TRI->isSGPRReg(MRI, Reg);
   }
   case ISD::LOAD: {
diff --git a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
index 8cea65d2c50e4..2887de8aaea3c 100644
--- a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
@@ -961,56 +961,56 @@ define void @test_call_a16i1_func_void(ptr addrspace(1) %in, ptr addrspace(1) %o
 ; GFX11-NEXT:    v_writelane_b32 v19, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s3
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
-; GFX11-NEXT:    global_store_b8 v[17:18], v1, off dlc
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s5
+; GFX11-NEXT:    global_store_b8 v[17:18], v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b8 v[17:18], v2, off offset:1 dlc
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX11-NEXT:    v_readlane_b32 s31, v19, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v19, 0
+; GFX11-NEXT:    global_store_b8 v[17:18], v0, off offset:1 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b8 v[17:18], v3, off offset:2 dlc
+; GFX11-NEXT:    global_store_b8 v[17:18], v1, off offset:2 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b8 v[17:18], v4, off offset:3 dlc
+; GFX11-NEXT:    global_store_b8 v[17:18], v2, off offset:3 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b8 v[17:18], v5, off offset:4 dlc
+; GFX11-NEXT:    global_store_b8 v[17:18], v3, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s5
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s6
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s7
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s8
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s9
-; GFX11-NEXT:    global_store_b8 v[17:18], v1, off offset:5 dlc
+; GFX11-NEXT:    global_store_b8 v[17:18], v4, off offset:5 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b8 v[17:18], v2, off offset:6 dlc
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s10
+; GFX11-NEXT:    global_store_b8 v[17:18], v0, off offset:6 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b8 v[17:18], v3, off offset:7 dlc
+; GFX11-NEXT:    global_store_b8 v[17:18], v1, off offset:7 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b8 v[17:18], v4, off offset:8 dlc
+; GFX11-NEXT:    global_store_b8 v[17:18], v2, off offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b8 v[17:18], v5, off offset:9 dlc
+; GFX11-NEXT:    global_store_b8 v[17:18], v3, off offset:9 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s10
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s11
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s12
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s13
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s14
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s15
-; GFX11-NEXT:    global_store_b8 v[17:18], v1, off offset:10 dlc
+; GFX11-NEXT:    global_store_b8 v[17:18], v4, off offset:10 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b8 v[17:18], v2, off offset:11 dlc
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s11
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s13
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s15
+; GFX11-NEXT:    global_store_b8 v[17:18], v0, off offset:11 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b8 v[17:18], v3, off offset:12 dlc
+; GFX11-NEXT:    global_store_b8 v[17:18], v1, off offset:12 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b8 v[17:18], v4, off offset:13 dlc
+; GFX11-NEXT:    global_store_b8 v[17:18], v2, off offset:13 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b8 v[17:18], v5, off offset:14 dlc
+; GFX11-NEXT:    global_store_b8 v[17:18], v3, off offset:14 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b8 v[17:18], v0, off offset:15 dlc
+; GFX11-NEXT:    global_store_b8 v[17:18], v4, off offset:15 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_readlane_b32 s31, v19, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v19, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v19, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0

>From df3a52e8f182b85b80d7037b2b68c394de267fd9 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 6 Jun 2024 16:49:35 -0500
Subject: [PATCH 25/25] Undo 6aaa564, i.e., remove the overloaded
 CallLowering::determineAndHandleAssignments(). Instead, call
 determineAssignments() and handleAssignments() separately.

---
 llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h |  6 ------
 llvm/lib/CodeGen/GlobalISel/CallLowering.cpp        | 13 -------------
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp       | 13 +++++++++----
 3 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
index 148b187a1ee55..4c187a3068d82 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
@@ -407,12 +407,6 @@ class CallLowering {
       CallingConv::ID CallConv, bool IsVarArg,
       ArrayRef<Register> ThisReturnRegs = std::nullopt) const;
 
-  bool determineAndHandleAssignments(
-      ValueHandler &Handler, ValueAssigner &Assigner,
-      SmallVectorImpl<ArgInfo> &Args, MachineIRBuilder &MIRBuilder,
-      CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
-      ArrayRef<Register> ThisReturnRegs = std::nullopt) const;
-
   /// Use \p Handler to insert code to handle the argument/return values
   /// represented by \p Args. It's expected determineAssignments previously
   /// processed these arguments to populate \p CCState and \p ArgLocs.
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index e363436ac7739..363fad53b76c3 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -628,19 +628,6 @@ bool CallLowering::determineAndHandleAssignments(
                            ThisReturnRegs);
 }
 
-bool CallLowering::determineAndHandleAssignments(
-    ValueHandler &Handler, ValueAssigner &Assigner,
-    SmallVectorImpl<ArgInfo> &Args, MachineIRBuilder &MIRBuilder,
-    CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
-    ArrayRef<Register> ThisReturnRegs) const {
-
-  if (!determineAssignments(Assigner, Args, CCInfo))
-    return false;
-
-  return handleAssignments(Handler, Args, CCInfo, ArgLocs, MIRBuilder,
-                           ThisReturnRegs);
-}
-
 static unsigned extendOpFromFlags(llvm::ISD::ArgFlagsTy Flags) {
   if (Flags.isSExt())
     return TargetOpcode::G_SEXT;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 6d4b81df13a05..a7c4e0997ba54 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -384,8 +384,10 @@ bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
     SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
     CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
   }
-  return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
-                                       CCInfo, ArgLocs);
+  if (!determineAssignments(Assigner, SplitRetInfos, CCInfo))
+    return false;
+
+  return handleAssignments(RetHandler, SplitRetInfos, CCInfo, ArgLocs, B);
 }
 
 bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
@@ -1559,8 +1561,11 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
       SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
       CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
     }
-    if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
-                                       CCInfo, ArgLocs))
+
+    if (!determineAssignments(Assigner, InArgs, CCInfo))
+      return false;
+
+    if (!handleAssignments(Handler, InArgs, CCInfo, ArgLocs, MIRBuilder))
       return false;
   }
 



More information about the llvm-commits mailing list