[llvm] [AMDGPU] Allocate i1 argument to SGPRs (PR #72461)

Jun Wang via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 2 14:09:20 PST 2024


https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/72461

>From 5ea84d08da4c3fd45fc191f98499f31b53776740 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Wed, 15 Nov 2023 19:48:41 -0600
Subject: [PATCH 1/9] [AMDGPU] Allocate i1 argument to SGPRs

Currently i1 arguments are passed as 32-bit VGPRs. It would make more
sense to make use of SGPRs and pass these values as a wavesize bool mask.
---
 llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td |  5 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   | 13 +++++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp      | 23 +++++++++
 llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp  |  6 +++
 llvm/test/CodeGen/AMDGPU/z_callee.ll        | 33 ++++++++++++
 llvm/test/CodeGen/AMDGPU/z_caller.ll        | 43 ++++++++++++++++
 llvm/test/CodeGen/AMDGPU/z_caller2.ll       | 57 +++++++++++++++++++++
 7 files changed, 179 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/z_callee.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/z_caller.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/z_caller2.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index c5207228dc913..6922a5e8abb8f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -185,9 +185,12 @@ def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
 // Calling convention for leaf functions
 def CC_AMDGPU_Func : CallingConv<[
   CCIfByVal<CCPassByVal<4, 4>>,
-  CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
 
+  CCIfType<[i1] , CCAssignToReg<
+    !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
+  >>,
+
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
     !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
   >>>,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3d4adb16a2716..a519bb350f25c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3637,6 +3637,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
   }
 
+  // In code below (after call of AnalyzeCallOperands),
+  // if (!Subtarget->enableFlatScratch()), it would use either s[48:51] or
+  // s[0:3]. Therefore, before calling AnalyzeCallOperands, we may need to
+  // reserve these registers.
+  if (!Subtarget->enableFlatScratch()) {
+    if (IsChainCallConv)
+      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
+          AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50, AMDGPU::SGPR51}, 4);
+    else
+      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
+          AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3}, 4);
+  }
+
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
 
   // Get a count of how many bytes are to be pushed on the stack.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 696e74cb592f9..2e8814a1ecaf4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -861,6 +861,16 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
+      // When calling convention allocates SGPR for i1 argument, we may
+      // have a SRPR_64 to SReg_32 copy for an outgoing i1 argument. Adjust
+      // the copy to avoid illegal copy.
+      if (AMDGPU::SGPR_64RegClass.contains(SrcReg)) {
+        auto sub0 = RI.getSubReg(SrcReg, AMDGPU::sub0);
+        if (sub0 != DestReg)
+          BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg).addReg(sub0);
+        return;
+      }
+
       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
       return;
     }
@@ -894,6 +904,19 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+      // When an i1 argument is allocated to an SGPR_32, we may have a COPY
+      // from SGPR_32 to SReg_64. The following handles this case to avoid
+      // an illegal copy.
+      if(AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
+        auto sub0 = RI.getSubReg(DestReg, AMDGPU::sub0);
+        if (sub0 != SrcReg) {
+          BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), sub0).addReg(SrcReg);
+        }
+        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32),
+                RI.getSubReg(DestReg, AMDGPU::sub1)).addImm(0);
+        return;
+      }
+
       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
       return;
     }
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index cfa0c21def791..07cebab1b6c2a 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -479,6 +479,12 @@ bool Vreg1LoweringHelper::lowerCopiesFromI1() {
       if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
         continue;
 
+      // When the calling convention allocates i1 argument to SGPR,
+      // we may have a COPY with dst being an SGPR_32. This should
+      // not be lowered into V_CNDMASK_B32.
+      if(AMDGPU::SGPR_32RegClass.contains(DstReg))
+        continue;
+
       Changed = true;
 
       // Copy into a 32-bit vector register.
diff --git a/llvm/test/CodeGen/AMDGPU/z_callee.ll b/llvm/test/CodeGen/AMDGPU/z_callee.ll
new file mode 100644
index 0000000000000..2fc4befa279f3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_callee.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+define void @void_func_i1(i1 %arg0) #0 {
+; For CIGFX89, the i1 arg is passed in s4, but the v_cndmask insn uses s[4:5].
+; Therefore, the "s_mov_b32 s5, 0" is generated.
+;
+; CIGFX89-LABEL: void_func_i1:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s5, 0
+; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller.ll b/llvm/test/CodeGen/AMDGPU/z_caller.ll
new file mode 100644
index 0000000000000..faf25e407fca2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_caller.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+
+declare hidden void @external_void_func_i1(i1) #0
+
+define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s4, -1
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_i1 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_i1 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i1_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s0, -1
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+  call void @external_void_func_i1(i1 true)
+  ret void
+}
+
+attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller2.ll b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
new file mode 100644
index 0000000000000..e63ae50b7e91c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+
+declare hidden void @external_void_func_i1_signext(i1 signext) #0
+
+define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_i1_signext at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_i1_signext at rel32@hi+12
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i1_signext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GFX11-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext at rel32@hi+12
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 1, v0
+; GFX11-NEXT:    s_mov_b32 s0, s2
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX11-NEXT:    s_endpgm
+  %var = load volatile i1, ptr addrspace(1) undef
+  call void @external_void_func_i1_signext(i1 signext %var)
+  ret void
+}
+
+
+
+attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }

>From 94006a3ae1482e0b7320cde8685355ec93c6f2a5 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Wed, 15 Nov 2023 20:37:27 -0600
Subject: [PATCH 2/9] Fix format.

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  | 11 +++++++----
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp     | 13 +++++++------
 llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp |  2 +-
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a519bb350f25c..7978d100082ae 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3643,11 +3643,14 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   // reserve these registers.
   if (!Subtarget->enableFlatScratch()) {
     if (IsChainCallConv)
-      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
-          AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50, AMDGPU::SGPR51}, 4);
+      CCInfo.AllocateRegBlock(
+          ArrayRef<MCPhysReg>{AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50,
+                              AMDGPU::SGPR51},
+          4);
     else
-      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
-          AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3}, 4);
+      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1,
+                                                  AMDGPU::SGPR2, AMDGPU::SGPR3},
+                              4);
   }
 
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2e8814a1ecaf4..0c151fae02960 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -876,7 +876,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
-            .addReg(SrcReg, getKillRegState(KillSrc));
+        .addReg(SrcReg, getKillRegState(KillSrc));
     return;
   }
 
@@ -891,13 +891,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     if (DestReg == AMDGPU::VCC) {
       if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
-          .addReg(SrcReg, getKillRegState(KillSrc));
+            .addReg(SrcReg, getKillRegState(KillSrc));
       } else {
         // FIXME: Hack until VReg_1 removed.
         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
-          .addImm(0)
-          .addReg(SrcReg, getKillRegState(KillSrc));
+            .addImm(0)
+            .addReg(SrcReg, getKillRegState(KillSrc));
       }
 
       return;
@@ -907,13 +907,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       // When an i1 argument is allocated to an SGPR_32, we may have a COPY
       // from SGPR_32 to SReg_64. The following handles this case to avoid
       // an illegal copy.
-      if(AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
+      if (AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
         auto sub0 = RI.getSubReg(DestReg, AMDGPU::sub0);
         if (sub0 != SrcReg) {
           BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), sub0).addReg(SrcReg);
         }
         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32),
-                RI.getSubReg(DestReg, AMDGPU::sub1)).addImm(0);
+                RI.getSubReg(DestReg, AMDGPU::sub1))
+            .addImm(0);
         return;
       }
 
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 07cebab1b6c2a..27b4d07602092 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -482,7 +482,7 @@ bool Vreg1LoweringHelper::lowerCopiesFromI1() {
       // When the calling convention allocates i1 argument to SGPR,
       // we may have a COPY with dst being an SGPR_32. This should
       // not be lowered into V_CNDMASK_B32.
-      if(AMDGPU::SGPR_32RegClass.contains(DstReg))
+      if (AMDGPU::SGPR_32RegClass.contains(DstReg))
         continue;
 
       Changed = true;

>From 27c3f478725218930eb445e46b157a5444af9fff Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 30 Nov 2023 12:31:17 -0600
Subject: [PATCH 3/9] Creating a custom calling conv function for i1.

---
 llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td   |  9 +--
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 31 +++++++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  9 ++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 24 ------
 llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp    | 13 +--
 llvm/test/CodeGen/AMDGPU/z_callee.ll          |  7 +-
 llvm/test/CodeGen/AMDGPU/z_caller.ll          |  6 +-
 llvm/test/CodeGen/AMDGPU/z_caller2.ll         |  4 +-
 llvm/test/CodeGen/AMDGPU/z_return.ll          | 80 +++++++++++++++++++
 9 files changed, 137 insertions(+), 46 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/z_return.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 6922a5e8abb8f..0966365e3a975 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -187,9 +187,7 @@ def CC_AMDGPU_Func : CallingConv<[
   CCIfByVal<CCPassByVal<4, 4>>,
   CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
 
-  CCIfType<[i1] , CCAssignToReg<
-    !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
-  >>,
+  CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
 
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
     !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
@@ -205,8 +203,9 @@ def CC_AMDGPU_Func : CallingConv<[
 
 // Calling convention for leaf functions
 def RetCC_AMDGPU_Func : CallingConv<[
-  CCIfType<[i1], CCPromoteToType<i32>>,
-  CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
+  CCIfType<[i16], CCIfExtend<CCPromoteToType<i32>>>,
+  CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
+
   CCIfType<[i32, f32, i16, f16, v2i16, v2f16, bf16, v2bf16], CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index b420e72d87ed0..dcb71ebea426b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -29,6 +29,37 @@
 
 using namespace llvm;
 
+static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT,
+                           MVT LocVT, CCValAssign::LocInfo LocInfo,
+                           ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static bool IsWave64 = static_cast<const GCNSubtarget&>(State.getMachineFunction().getSubtarget()).isWave64();
+
+  static const MCPhysReg I1RegList1[] = {
+    AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR2_SGPR3, AMDGPU::SGPR4_SGPR5,
+    AMDGPU::SGPR6_SGPR7, AMDGPU::SGPR8_SGPR9, AMDGPU::SGPR10_SGPR11,
+    AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
+    AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
+    AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29
+  };
+
+  static const MCPhysReg I1RegList2[] = {
+    AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3, AMDGPU::SGPR4,
+    AMDGPU::SGPR5, AMDGPU::SGPR6, AMDGPU::SGPR7, AMDGPU::SGPR8, AMDGPU::SGPR9,
+    AMDGPU::SGPR10, AMDGPU::SGPR11, AMDGPU::SGPR12, AMDGPU::SGPR13,
+    AMDGPU::SGPR14, AMDGPU::SGPR15, AMDGPU::SGPR16, AMDGPU::SGPR17,
+    AMDGPU::SGPR18, AMDGPU::SGPR19, AMDGPU::SGPR20, AMDGPU::SGPR21,
+    AMDGPU::SGPR22, AMDGPU::SGPR23, AMDGPU::SGPR24, AMDGPU::SGPR25,
+    AMDGPU::SGPR26, AMDGPU::SGPR27, AMDGPU::SGPR28, AMDGPU::SGPR29
+  };
+
+  assert (LocVT == MVT::i1);
+  if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1) : State.AllocateReg(I1RegList2)) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return true;
+  }
+  return false; // not allocated
+}
+
 #include "AMDGPUGenCallingConv.inc"
 
 static cl::opt<bool> AMDGPUBypassSlowDiv(
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7978d100082ae..0371c21602c25 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2994,8 +2994,13 @@ SDValue SITargetLowering::LowerFormalArguments(
       RC = &AMDGPU::VGPR_32RegClass;
     else if (AMDGPU::SGPR_32RegClass.contains(Reg))
       RC = &AMDGPU::SGPR_32RegClass;
-    else
-      llvm_unreachable("Unexpected register class in LowerFormalArguments!");
+    else {
+      if (VT == MVT::i1 && Subtarget->isWave64())
+        RC = &AMDGPU::SGPR_64RegClass;
+      else
+        llvm_unreachable("Unexpected register class in LowerFormalArguments!");
+    }
+
     EVT ValVT = VA.getValVT();
 
     Reg = MF.addLiveIn(Reg, RC);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0c151fae02960..01c3f5bcbe0ea 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -861,16 +861,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
-      // When calling convention allocates SGPR for i1 argument, we may
-      // have a SRPR_64 to SReg_32 copy for an outgoing i1 argument. Adjust
-      // the copy to avoid illegal copy.
-      if (AMDGPU::SGPR_64RegClass.contains(SrcReg)) {
-        auto sub0 = RI.getSubReg(SrcReg, AMDGPU::sub0);
-        if (sub0 != DestReg)
-          BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg).addReg(sub0);
-        return;
-      }
-
       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
       return;
     }
@@ -904,20 +894,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
-      // When an i1 argument is allocated to an SGPR_32, we may have a COPY
-      // from SGPR_32 to SReg_64. The following handles this case to avoid
-      // an illegal copy.
-      if (AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
-        auto sub0 = RI.getSubReg(DestReg, AMDGPU::sub0);
-        if (sub0 != SrcReg) {
-          BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), sub0).addReg(SrcReg);
-        }
-        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32),
-                RI.getSubReg(DestReg, AMDGPU::sub1))
-            .addImm(0);
-        return;
-      }
-
       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
       return;
     }
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 27b4d07602092..14d0dff31c5aa 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -479,12 +479,6 @@ bool Vreg1LoweringHelper::lowerCopiesFromI1() {
       if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
         continue;
 
-      // When the calling convention allocates i1 argument to SGPR,
-      // we may have a COPY with dst being an SGPR_32. This should
-      // not be lowered into V_CNDMASK_B32.
-      if (AMDGPU::SGPR_32RegClass.contains(DstReg))
-        continue;
-
       Changed = true;
 
       // Copy into a 32-bit vector register.
@@ -693,6 +687,13 @@ bool Vreg1LoweringHelper::lowerCopiesToI1() {
       assert(!MI.getOperand(1).getSubReg());
 
       if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
+        if (!SrcReg.isVirtual() && TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
+          // When calling convention allocates SGPR for i1, for GPUs with wavefront size 64, i1
+          // return value is put in 64b SGPR.
+          assert(ST->isWave64());
+          continue;
+        }
+
         assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
         Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
diff --git a/llvm/test/CodeGen/AMDGPU/z_callee.ll b/llvm/test/CodeGen/AMDGPU/z_callee.ll
index 2fc4befa279f3..44af2c90f900b 100644
--- a/llvm/test/CodeGen/AMDGPU/z_callee.ll
+++ b/llvm/test/CodeGen/AMDGPU/z_callee.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
 
 define void @void_func_i1(i1 %arg0) #0 {
 ; For CIGFX89, the i1 arg is passed in s4, but the v_cndmask insn uses s[4:5].
@@ -11,7 +11,6 @@ define void @void_func_i1(i1 %arg0) #0 {
 ; CIGFX89-LABEL: void_func_i1:
 ; CIGFX89:       ; %bb.0:
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    s_mov_b32 s5, 0
 ; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller.ll b/llvm/test/CodeGen/AMDGPU/z_caller.ll
index faf25e407fca2..f9203cf078e47 100644
--- a/llvm/test/CodeGen/AMDGPU/z_caller.ll
+++ b/llvm/test/CodeGen/AMDGPU/z_caller.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
 
 
 declare hidden void @external_void_func_i1(i1) #0
@@ -17,7 +17,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    s_mov_b32 s4, -1
+; GFX9-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_getpc_b64 s[8:9]
 ; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_i1 at rel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller2.ll b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
index e63ae50b7e91c..1141476960250 100644
--- a/llvm/test/CodeGen/AMDGPU/z_caller2.ll
+++ b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
 
 
 declare hidden void @external_void_func_i1_signext(i1 signext) #0
diff --git a/llvm/test/CodeGen/AMDGPU/z_return.ll b/llvm/test/CodeGen/AMDGPU/z_return.ll
new file mode 100644
index 0000000000000..6bf64da7a1b8f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_return.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+define i1 @i1_func_void() #0 {
+  %val = load i1, ptr addrspace(1) undef
+  ret i1 %val
+}
+
+define void @test_call_i1_func_void() #0 {
+; CIGFX89-LABEL: test_call_i1_func_void:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s6, s33
+; CIGFX89-NEXT:    s_mov_b32 s33, s32
+; CIGFX89-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; CIGFX89-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; CIGFX89-NEXT:    s_mov_b64 exec, s[4:5]
+; CIGFX89-NEXT:    s_addk_i32 s32, 0x400
+; CIGFX89-NEXT:    s_getpc_b64 s[4:5]
+; CIGFX89-NEXT:    s_add_u32 s4, s4, i1_func_void at gotpcrel32@lo+4
+; CIGFX89-NEXT:    s_addc_u32 s5, s5, i1_func_void at gotpcrel32@hi+12
+; CIGFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CIGFX89-NEXT:    v_writelane_b32 v1, s30, 0
+; CIGFX89-NEXT:    v_writelane_b32 v1, s31, 1
+; CIGFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; CIGFX89-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CIGFX89-NEXT:    global_store_byte v[2:3], v0, off
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    v_readlane_b32 s31, v1, 1
+; CIGFX89-NEXT:    v_readlane_b32 s30, v1, 0
+; CIGFX89-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; CIGFX89-NEXT:    buffer_load_dword v1, off, s[0:3], s33  ; 4-byte Folded Reload
+; CIGFX89-NEXT:    s_mov_b64 exec, s[4:5]
+; CIGFX89-NEXT:    s_addk_i32 s32, 0xfc00
+; CIGFX89-NEXT:    s_mov_b32 s33, s6
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_i1_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v1, s33          ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, i1_func_void at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, i1_func_void at gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v1, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v1, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, s0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    global_store_b8 v[2:3], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v1, off, s33           ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+
+  %val = call i1 @i1_func_void()
+  store volatile i1 %val, ptr addrspace(1) undef
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+

>From 865cdc2bc5dbc3132490d0574e511403dbcef9e0 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 30 Nov 2023 20:04:19 -0600
Subject: [PATCH 4/9] Fix formatting.

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 44 ++++++++++---------
 llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp    |  7 +--
 2 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index dcb71ebea426b..91d7171dd8114 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -29,31 +29,33 @@
 
 using namespace llvm;
 
-static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT,
-                           MVT LocVT, CCValAssign::LocInfo LocInfo,
-                           ISD::ArgFlagsTy ArgFlags, CCState &State) {
-  static bool IsWave64 = static_cast<const GCNSubtarget&>(State.getMachineFunction().getSubtarget()).isWave64();
+static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT, MVT LocVT,
+                                CCValAssign::LocInfo LocInfo,
+                                ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static bool IsWave64 = static_cast<const GCNSubtarget &>(
+                             State.getMachineFunction().getSubtarget())
+                             .isWave64();
 
   static const MCPhysReg I1RegList1[] = {
-    AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR2_SGPR3, AMDGPU::SGPR4_SGPR5,
-    AMDGPU::SGPR6_SGPR7, AMDGPU::SGPR8_SGPR9, AMDGPU::SGPR10_SGPR11,
-    AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
-    AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
-    AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29
-  };
+      AMDGPU::SGPR0_SGPR1,   AMDGPU::SGPR2_SGPR3,   AMDGPU::SGPR4_SGPR5,
+      AMDGPU::SGPR6_SGPR7,   AMDGPU::SGPR8_SGPR9,   AMDGPU::SGPR10_SGPR11,
+      AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
+      AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
+      AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29};
 
   static const MCPhysReg I1RegList2[] = {
-    AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3, AMDGPU::SGPR4,
-    AMDGPU::SGPR5, AMDGPU::SGPR6, AMDGPU::SGPR7, AMDGPU::SGPR8, AMDGPU::SGPR9,
-    AMDGPU::SGPR10, AMDGPU::SGPR11, AMDGPU::SGPR12, AMDGPU::SGPR13,
-    AMDGPU::SGPR14, AMDGPU::SGPR15, AMDGPU::SGPR16, AMDGPU::SGPR17,
-    AMDGPU::SGPR18, AMDGPU::SGPR19, AMDGPU::SGPR20, AMDGPU::SGPR21,
-    AMDGPU::SGPR22, AMDGPU::SGPR23, AMDGPU::SGPR24, AMDGPU::SGPR25,
-    AMDGPU::SGPR26, AMDGPU::SGPR27, AMDGPU::SGPR28, AMDGPU::SGPR29
-  };
-
-  assert (LocVT == MVT::i1);
-  if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1) : State.AllocateReg(I1RegList2)) {
+      AMDGPU::SGPR0,  AMDGPU::SGPR1,  AMDGPU::SGPR2,  AMDGPU::SGPR3,
+      AMDGPU::SGPR4,  AMDGPU::SGPR5,  AMDGPU::SGPR6,  AMDGPU::SGPR7,
+      AMDGPU::SGPR8,  AMDGPU::SGPR9,  AMDGPU::SGPR10, AMDGPU::SGPR11,
+      AMDGPU::SGPR12, AMDGPU::SGPR13, AMDGPU::SGPR14, AMDGPU::SGPR15,
+      AMDGPU::SGPR16, AMDGPU::SGPR17, AMDGPU::SGPR18, AMDGPU::SGPR19,
+      AMDGPU::SGPR20, AMDGPU::SGPR21, AMDGPU::SGPR22, AMDGPU::SGPR23,
+      AMDGPU::SGPR24, AMDGPU::SGPR25, AMDGPU::SGPR26, AMDGPU::SGPR27,
+      AMDGPU::SGPR28, AMDGPU::SGPR29};
+
+  assert(LocVT == MVT::i1);
+  if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1)
+                              : State.AllocateReg(I1RegList2)) {
     State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
     return true;
   }
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 14d0dff31c5aa..e854ab29cae46 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -687,9 +687,10 @@ bool Vreg1LoweringHelper::lowerCopiesToI1() {
       assert(!MI.getOperand(1).getSubReg());
 
       if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
-        if (!SrcReg.isVirtual() && TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
-          // When calling convention allocates SGPR for i1, for GPUs with wavefront size 64, i1
-          // return value is put in 64b SGPR.
+        if (!SrcReg.isVirtual() &&
+            TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
+          // When calling convention allocates SGPR for i1, for GPUs with
+          // wavefront size 64, i1 return value is put in 64b SGPR.
           assert(ST->isWave64());
           continue;
         }

>From 1fe19d6028a3c9a5d4b95f9d4a27ec9c6f18d6d7 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 21 Dec 2023 16:13:47 -0600
Subject: [PATCH 5/9] Fix (1) GlobalISel handling of both incoming i1
 arguments and the i1 return value, and (2) a problem in
 AMDGPUCallingConv.td when no SGPRs are available.

---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |  20 +-
 llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td   |   2 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  13 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   2 +-
 .../irtranslator-call-return-values.ll        |  20 +-
 .../AMDGPU/GlobalISel/irtranslator-call.ll    |  42 +-
 .../GlobalISel/irtranslator-function-args.ll  | 243 ++++++++++--
 .../GlobalISel/irtranslator-invariant.ll      |   6 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll |  48 +--
 .../CodeGen/AMDGPU/GlobalISel/localizer.ll    |   6 +-
 ...amdgpu-codegenprepare-fold-binop-select.ll | 278 ++++++-------
 llvm/test/CodeGen/AMDGPU/function-args.ll     | 370 +++++++++++++++---
 llvm/test/CodeGen/AMDGPU/function-returns.ll  |   5 +
 llvm/test/CodeGen/AMDGPU/z_callee.ll          |  32 --
 llvm/test/CodeGen/AMDGPU/z_caller.ll          |  43 --
 llvm/test/CodeGen/AMDGPU/z_caller2.ll         |  57 ---
 llvm/test/CodeGen/AMDGPU/z_return.ll          |  80 ----
 17 files changed, 754 insertions(+), 513 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/z_callee.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/z_caller.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/z_caller2.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/z_return.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 6d05c3678bf09..c00021105b8c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -124,7 +124,15 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
     if (VA.getLocVT().getSizeInBits() < 32) {
       // 16-bit types are reported as legal for 32-bit registers. We need to do
       // a 32-bit copy, and truncate to avoid the verifier complaining about it.
-      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
+      unsigned CopyToBits = 32;
+
+      // When function return type is i1, it may be in a 64b register.
+      if (VA.getLocVT().getSizeInBits() == 1) {
+        if (MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64)
+          CopyToBits = 64;
+      }
+
+      auto Copy = MIRBuilder.buildCopy(LLT::scalar(CopyToBits), PhysReg);
 
       // If we have signext/zeroext, it applies to the whole 32-bit register
       // before truncation.
@@ -233,7 +241,15 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
   void assignValueToReg(Register ValVReg, Register PhysReg,
                         const CCValAssign &VA) override {
     MIB.addUse(PhysReg, RegState::Implicit);
-    Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
+    Register ExtReg;
+
+    if (VA.getLocVT().getSizeInBits() == 1 &&
+        MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64) {
+      ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(64), ValVReg).getReg(0);
+    } else {
+      ExtReg = extendRegisterMin32(*this, ValVReg, VA);
+    }
+
     MIRBuilder.buildCopy(PhysReg, ExtReg);
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 0966365e3a975..8a9c1512a73d6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -189,6 +189,8 @@ def CC_AMDGPU_Func : CallingConv<[
 
   CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
 
+  CCIfType<[i1], CCPromoteToType<i32>>,
+
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
     !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
   >>>,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 91d7171dd8114..a6ecfade9f9d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -32,18 +32,17 @@ using namespace llvm;
 static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT, MVT LocVT,
                                 CCValAssign::LocInfo LocInfo,
                                 ISD::ArgFlagsTy ArgFlags, CCState &State) {
-  static bool IsWave64 = static_cast<const GCNSubtarget &>(
-                             State.getMachineFunction().getSubtarget())
-                             .isWave64();
+  static bool IsWave64 =
+      State.getMachineFunction().getSubtarget<GCNSubtarget>().isWave64();
 
-  static const MCPhysReg I1RegList1[] = {
+  static const MCPhysReg SGPRArgsWave64[] = {
       AMDGPU::SGPR0_SGPR1,   AMDGPU::SGPR2_SGPR3,   AMDGPU::SGPR4_SGPR5,
       AMDGPU::SGPR6_SGPR7,   AMDGPU::SGPR8_SGPR9,   AMDGPU::SGPR10_SGPR11,
       AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
       AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
       AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29};
 
-  static const MCPhysReg I1RegList2[] = {
+  static const MCPhysReg SGPRArgsWave32[] = {
       AMDGPU::SGPR0,  AMDGPU::SGPR1,  AMDGPU::SGPR2,  AMDGPU::SGPR3,
       AMDGPU::SGPR4,  AMDGPU::SGPR5,  AMDGPU::SGPR6,  AMDGPU::SGPR7,
       AMDGPU::SGPR8,  AMDGPU::SGPR9,  AMDGPU::SGPR10, AMDGPU::SGPR11,
@@ -54,8 +53,8 @@ static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT, MVT LocVT,
       AMDGPU::SGPR28, AMDGPU::SGPR29};
 
   assert(LocVT == MVT::i1);
-  if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1)
-                              : State.AllocateReg(I1RegList2)) {
+  if (unsigned Reg = IsWave64 ? State.AllocateReg(SGPRArgsWave64)
+                              : State.AllocateReg(SGPRArgsWave32)) {
     State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
     return true;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0371c21602c25..b4272466a187f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2996,7 +2996,7 @@ SDValue SITargetLowering::LowerFormalArguments(
       RC = &AMDGPU::SGPR_32RegClass;
     else {
       if (VT == MVT::i1 && Subtarget->isWave64())
-        RC = &AMDGPU::SGPR_64RegClass;
+        RC = Subtarget->getBoolRC();
       else
         llvm_unreachable("Unexpected register class in LowerFormalArguments!");
     }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
index 8b0a006e29c00..67d87071e70e6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
@@ -198,9 +198,9 @@ define amdgpu_kernel void @test_call_external_i1_func_void() #0 {
   ; GCN-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
   ; GCN-NEXT:   $sgpr15 = COPY [[DEF2]](s32)
   ; GCN-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0
-  ; GCN-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
@@ -275,10 +275,9 @@ define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 {
   ; GCN-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
   ; GCN-NEXT:   $sgpr15 = COPY [[DEF2]](s32)
   ; GCN-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0
-  ; GCN-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; GCN-NEXT:   [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY19]], 1
-  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_ZEXT]](s32)
+  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
   ; GCN-NEXT:   G_STORE [[ZEXT]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
@@ -336,10 +335,9 @@ define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 {
   ; GCN-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
   ; GCN-NEXT:   $sgpr15 = COPY [[DEF2]](s32)
   ; GCN-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0
-  ; GCN-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; GCN-NEXT:   [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY19]], 1
-  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_SEXT]](s32)
+  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
   ; GCN-NEXT:   G_STORE [[SEXT]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index cb0efc19169dc..93c8355a28c9f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -368,12 +368,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
   ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
   ; CHECK-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32)
   ; CHECK-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF]](p4)
+  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s1)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[ANYEXT]](s64)
+  ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
   ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
   ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
   ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
@@ -381,7 +381,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[DEF1]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   S_ENDPGM 0
   call void @external_void_func_i1(i1 true)
@@ -426,12 +426,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
   ; CHECK-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32)
   ; CHECK-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[SEXT]](s32)
-  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF1]](p4)
+  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s1)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[SEXT]](s64)
+  ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
   ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
   ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
   ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
@@ -439,7 +439,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[DEF2]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_signext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_signext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   S_ENDPGM 0
   %var = load volatile i1, ptr addrspace(1) undef
@@ -485,12 +485,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
   ; CHECK-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32)
   ; CHECK-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ZEXT]](s32)
-  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF1]](p4)
+  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s1)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[ZEXT]](s64)
+  ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
   ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
   ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
   ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
@@ -498,7 +498,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[DEF2]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_zeroext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_zeroext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   S_ENDPGM 0
   %var = load volatile i1, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index 2f0156d67bdfe..696bb34d601be 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -3,6 +3,7 @@
 ; the frame info, so some functions have manually added stack object
 ; checks.
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s
 ; FIXME: pre-VI should have same ABI without legal i16 operations.
 
 define void @void_func_empty_arg({} %arg0, i32 %arg1) #0 {
@@ -34,10 +35,10 @@ define void @void_func_empty_array([0 x i8] %arg0, i32 %arg1) #0 {
 define void @void_func_i1(i1 %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
@@ -48,11 +49,10 @@ define void @void_func_i1(i1 %arg0) #0 {
 define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1_zeroext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 1
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_ZEXT]](s32)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
@@ -68,11 +68,10 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
 define void @void_func_i1_signext(i1 signext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1_signext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY]], 1
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_SEXT]](s32)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
@@ -89,10 +88,10 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
   ; CHECK-LABEL: name: i1_arg_i1_use
   ; CHECK: bb.1.bb:
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -1986,25 +1985,25 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
   ; CHECK-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28
   ; CHECK-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
   ; CHECK-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
-  ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.4, align 16, addrspace 5)
+  ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, addrspace 5)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32)
-  ; CHECK-NEXT:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s1) from %fixed-stack.3, align 4, addrspace 5)
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD1]](s32)
-  ; CHECK-NEXT:   [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
-  ; CHECK-NEXT:   [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s16) from %fixed-stack.2, align 8, addrspace 5)
-  ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD2]](s16)
-  ; CHECK-NEXT:   [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
-  ; CHECK-NEXT:   [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s16) from %fixed-stack.1, align 4, addrspace 5)
-  ; CHECK-NEXT:   [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
-  ; CHECK-NEXT:   [[LOAD4:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s16) from %fixed-stack.0, align 16, addrspace 5)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s64)
+  ; CHECK-NEXT:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
+  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s16) from %fixed-stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s16)
+  ; CHECK-NEXT:   [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
+  ; CHECK-NEXT:   [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s16) from %fixed-stack.1, align 8, addrspace 5)
+  ; CHECK-NEXT:   [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
+  ; CHECK-NEXT:   [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s16) from %fixed-stack.0, align 4, addrspace 5)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+
   ; CHECK-NEXT:   G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[LOAD2]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   G_STORE [[LOAD3]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD4]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile i1 %arg1, ptr addrspace(1) undef
@@ -3230,6 +3229,199 @@ define void @void_func_v2p3_inreg(<2 x ptr addrspace(3)> inreg %arg0) #0 {
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<2 x p3>), [[DEF]](p1) :: (store (<2 x p3>) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
   store <2 x ptr addrspace(3)> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; Check calling convention for i1 args
+define void @many_i1_args(
+  i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
+  i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
+  i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
+  i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
+; CHECK-LABEL: name: many_i1_args
+; CHECK: bb.1 (%ir-block.0):
+; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
+; CHECK-NEXT: {{  $}}
+; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s64) = COPY $sgpr2_sgpr3
+; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s64)
+; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; CHECK-NEXT:   [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
+; CHECK-NEXT:   [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
+; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(s64) = COPY $sgpr8_sgpr9
+; CHECK-NEXT:   [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s64)
+; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(s64) = COPY $sgpr10_sgpr11
+; CHECK-NEXT:   [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s64)
+; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(s64) = COPY $sgpr12_sgpr13
+; CHECK-NEXT:   [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s64)
+; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(s64) = COPY $sgpr14_sgpr15
+; CHECK-NEXT:   [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s64)
+; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
+; CHECK-NEXT:   [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s64)
+; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(s64) = COPY $sgpr18_sgpr19
+; CHECK-NEXT:   [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s64)
+; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(s64) = COPY $sgpr20_sgpr21
+; CHECK-NEXT:   [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s64)
+; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(s64) = COPY $sgpr22_sgpr23
+; CHECK-NEXT:   [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s64)
+; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s64) = COPY $sgpr24_sgpr25
+; CHECK-NEXT:   [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s64)
+; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s64) = COPY $sgpr26_sgpr27
+; CHECK-NEXT:   [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s64)
+; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s64) = COPY $sgpr28_sgpr29
+; CHECK-NEXT:   [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s64)
+; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr0
+; CHECK-NEXT:   [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
+; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr1
+; CHECK-NEXT:   [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
+; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr2
+; CHECK-NEXT:   [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
+; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr3
+; CHECK-NEXT:   [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
+; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr4
+; CHECK-NEXT:   [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr5
+; CHECK-NEXT:   [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
+; CHECK-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr6
+; CHECK-NEXT:   [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+; CHECK-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr7
+; CHECK-NEXT:   [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
+; CHECK-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr8
+; CHECK-NEXT:   [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
+; CHECK-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr9
+; CHECK-NEXT:   [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
+; CHECK-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr10
+; CHECK-NEXT:   [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
+; CHECK-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr11
+; CHECK-NEXT:   [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
+; CHECK-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr12
+; CHECK-NEXT:   [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
+; CHECK-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr13
+; CHECK-NEXT:   [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
+; CHECK-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr14
+; CHECK-NEXT:   [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; CHECK-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr15
+; CHECK-NEXT:   [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr16
+; CHECK-NEXT:   [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+;
+; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; CHECK directives for the G_STOREs of [[TRUNC1]] through [[TRUNC30]] are omitted for brevity.
+; CHECK:        G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+;
+; GFX11-LABEL: name: many_i1_args
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT:   [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
+; GFX11-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+; GFX11-NEXT:   [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr3
+; GFX11-NEXT:   [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s32)
+; GFX11-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX11-NEXT:   [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s32)
+; GFX11-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr5
+; GFX11-NEXT:   [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s32)
+; GFX11-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX11-NEXT:   [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s32)
+; GFX11-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr7
+; GFX11-NEXT:   [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
+; GFX11-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr8
+; GFX11-NEXT:   [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
+; GFX11-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr9
+; GFX11-NEXT:   [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
+; GFX11-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr10
+; GFX11-NEXT:   [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
+; GFX11-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr11
+; GFX11-NEXT:   [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
+; GFX11-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr12
+; GFX11-NEXT:   [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
+; GFX11-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY $sgpr13
+; GFX11-NEXT:   [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
+; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY $sgpr14
+; GFX11-NEXT:   [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
+; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $sgpr15
+; GFX11-NEXT:   [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
+; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $sgpr16
+; GFX11-NEXT:   [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
+; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $sgpr17
+; GFX11-NEXT:   [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
+; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $sgpr18
+; GFX11-NEXT:   [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
+; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $sgpr19
+; GFX11-NEXT:   [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $sgpr20
+; GFX11-NEXT:   [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
+; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $sgpr21
+; GFX11-NEXT:   [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $sgpr22
+; GFX11-NEXT:   [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
+; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $sgpr23
+; GFX11-NEXT:   [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
+; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $sgpr24
+; GFX11-NEXT:   [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
+; GFX11-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $sgpr25
+; GFX11-NEXT:   [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
+; GFX11-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $sgpr26
+; GFX11-NEXT:   [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
+; GFX11-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $sgpr27
+; GFX11-NEXT:   [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
+; GFX11-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $sgpr28
+; GFX11-NEXT:   [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
+; GFX11-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $sgpr29
+; GFX11-NEXT:   [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; GFX11-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT:   [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; GFX11-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX11-NEXT:   [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+;
+; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11 directives for the G_STOREs of [[TRUNC1]] through [[TRUNC30]] are omitted for brevity.
+; GFX11:        G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  store volatile i1 %arg2, ptr addrspace(1) undef
+  store volatile i1 %arg3, ptr addrspace(1) undef
+  store volatile i1 %arg4, ptr addrspace(1) undef
+  store volatile i1 %arg5, ptr addrspace(1) undef
+  store volatile i1 %arg6, ptr addrspace(1) undef
+  store volatile i1 %arg7, ptr addrspace(1) undef
+
+  store volatile i1 %arg8, ptr addrspace(1) undef
+  store volatile i1 %arg9, ptr addrspace(1) undef
+  store volatile i1 %arg10, ptr addrspace(1) undef
+  store volatile i1 %arg11, ptr addrspace(1) undef
+  store volatile i1 %arg12, ptr addrspace(1) undef
+  store volatile i1 %arg13, ptr addrspace(1) undef
+  store volatile i1 %arg14, ptr addrspace(1) undef
+  store volatile i1 %arg15, ptr addrspace(1) undef
+
+  store volatile i1 %arg16, ptr addrspace(1) undef
+  store volatile i1 %arg17, ptr addrspace(1) undef
+  store volatile i1 %arg18, ptr addrspace(1) undef
+  store volatile i1 %arg19, ptr addrspace(1) undef
+  store volatile i1 %arg20, ptr addrspace(1) undef
+  store volatile i1 %arg21, ptr addrspace(1) undef
+  store volatile i1 %arg22, ptr addrspace(1) undef
+  store volatile i1 %arg23, ptr addrspace(1) undef
+
+  store volatile i1 %arg24, ptr addrspace(1) undef
+  store volatile i1 %arg25, ptr addrspace(1) undef
+  store volatile i1 %arg26, ptr addrspace(1) undef
+  store volatile i1 %arg27, ptr addrspace(1) undef
+  store volatile i1 %arg28, ptr addrspace(1) undef
+  store volatile i1 %arg29, ptr addrspace(1) undef
+  store volatile i1 %arg30, ptr addrspace(1) undef
+  store volatile i1 %arg31, ptr addrspace(1) undef
+
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
index ec07b0b1d4f45..ac1eb4e2adda0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
@@ -22,10 +22,10 @@ define i32 @load_const_i32_gv() {
 define i32 @load_select_const_i32_gv(i1 %cond) {
   ; CHECK-LABEL: name: load_select_const_i32_gv
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv0
   ; CHECK-NEXT:   [[GV1:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv1
   ; CHECK-NEXT:   [[SELECT:%[0-9]+]]:_(p1) = G_SELECT [[TRUNC]](s1), [[GV]], [[GV1]]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index a5482bd5b79a9..c3b8a6b2b7526 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -10,8 +10,8 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX7-LABEL: v_div_fmas_f32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX7-NEXT:    s_and_b32 s4, 1, s0
+; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX7-NEXT:    s_nop 3
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -19,8 +19,8 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX8-LABEL: v_div_fmas_f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX8-NEXT:    s_and_b32 s4, 1, s0
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX8-NEXT:    s_nop 3
 ; GFX8-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -28,32 +28,32 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX10_W32-LABEL: v_div_fmas_f32:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W32-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX10_W32-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX10_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_W64-LABEL: v_div_fmas_f32:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX10_W64-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX10_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11_W32-LABEL: v_div_fmas_f32:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W32-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX11_W32-NEXT:    s_and_b32 s0, 1, s0
+; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX11_W32-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX11_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11_W64-LABEL: v_div_fmas_f32:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W64-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX11_W64-NEXT:    s_and_b32 s0, 1, s0
+; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX11_W64-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX11_W64-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d)
@@ -64,8 +64,8 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX7-LABEL: v_div_fmas_f64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX7-NEXT:    s_and_b32 s4, 1, s0
+; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX7-NEXT:    s_nop 3
 ; GFX7-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -73,8 +73,8 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX8-LABEL: v_div_fmas_f64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX8-NEXT:    s_and_b32 s4, 1, s0
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX8-NEXT:    s_nop 3
 ; GFX8-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -82,32 +82,32 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX10_W32-LABEL: v_div_fmas_f64:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W32-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX10_W32-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX10_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_W64-LABEL: v_div_fmas_f64:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX10_W64-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX10_W64-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX10_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11_W32-LABEL: v_div_fmas_f64:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W32-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11_W32-NEXT:    s_and_b32 s0, 1, s0
+; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX11_W32-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX11_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11_W64-LABEL: v_div_fmas_f64:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W64-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX11_W64-NEXT:    s_and_b32 s0, 1, s0
+; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX11_W64-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX11_W64-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 36bac87889cac..1cff9ba4d2340 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -168,9 +168,9 @@ define void @localize_internal_globals(i1 %cond) {
 ; GFX9-LABEL: localize_internal_globals:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:    s_and_b32 s4, 1, s0
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB2_3
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index 5c40a4ce13e31..9beec51710598 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -10,11 +10,10 @@ define i32 @select_sdiv_lhs_const_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_lhs_const_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x1e848
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x30d40
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_mov_b32 s6, 0x30d40
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x1e848
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 5, i32 8
   %op = sdiv i32 1000000, %select
@@ -29,11 +28,10 @@ define i32 @select_sdiv_rhs_const_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_rhs_const_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x2710
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x3e8
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_movk_i32 s6, 0x3e8
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x2710
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 42000, i32 420000
   %op = sdiv i32 %select, 42
@@ -48,11 +46,10 @@ define <2 x i32> @select_sdiv_lhs_const_v2i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_lhs_const_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x22b
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x29a
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_movk_i32 s6, 0x29a
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x22b
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x594
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, <2 x i32> <i32 5, i32 undef>, <2 x i32> <i32 6, i32 7>
@@ -68,14 +65,13 @@ define <2 x i32> @select_sdiv_rhs_const_v2i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_rhs_const_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x3661c
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x307dd
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x23b02a
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x13e3a0c
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_mov_b32 s6, 0x307dd
+; GCN-NEXT:    s_mov_b32 s5, 0x13e3a0c
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x3661c
+; GCN-NEXT:    s_cselect_b32 s5, s5, 0x23b02a
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, <2 x i32> <i32 8342123, i32 834212353>, <2 x i32> <i32 9355456, i32 93554321>
   %op = sdiv <2 x i32> %select, <i32 42, i32 40>
@@ -126,40 +122,41 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_lhs_opaque_const0_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, gv at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, gv at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, gv at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 5, v1, vcc
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_cselect_b32 s4, s6, 5
+; GCN-NEXT:    s_ashr_i32 s5, s4, 31
+; GCN-NEXT:    s_add_i32 s4, s4, s5
+; GCN-NEXT:    s_xor_b32 s4, s4, s5
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GCN-NEXT:    s_sub_i32 s6, 0, s4
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_lo_u32 v1, s6, v0
+; GCN-NEXT:    s_mov_b32 s6, 0xf4240
+; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v0
-; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0, v0
-; GCN-NEXT:    s_mov_b32 s4, 0xf4240
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_lo_u32 v3, v3, v2
-; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
-; GCN-NEXT:    v_mul_hi_u32 v2, v2, s4
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v0
-; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0xf4240, v3
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT:    v_sub_u32_e64 v4, s[4:5], v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s6
+; GCN-NEXT:    v_readfirstlane_b32 s6, v0
+; GCN-NEXT:    s_mul_i32 s6, s6, s4
+; GCN-NEXT:    s_sub_i32 s6, 0xf4240, s6
+; GCN-NEXT:    s_sub_i32 s7, s6, s4
+; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT:    s_cmp_ge_u32 s6, s4
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    s_cselect_b32 s6, s7, s6
+; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT:    s_cmp_ge_u32 s6, s4
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, s5, v0
+; GCN-NEXT:    v_subrev_u32_e32 v0, vcc, s5, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 ptrtoint (ptr addrspace(1) @gv to i32), i32 5
   %op = sdiv i32 1000000, %select
@@ -208,40 +205,41 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_lhs_opaque_const1_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, gv at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, gv at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, gv at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 5, vcc
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_cselect_b32 s4, 5, s6
+; GCN-NEXT:    s_ashr_i32 s5, s4, 31
+; GCN-NEXT:    s_add_i32 s4, s4, s5
+; GCN-NEXT:    s_xor_b32 s4, s4, s5
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GCN-NEXT:    s_sub_i32 s6, 0, s4
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_lo_u32 v1, s6, v0
+; GCN-NEXT:    s_mov_b32 s6, 0xf4240
+; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v0
-; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0, v0
-; GCN-NEXT:    s_mov_b32 s4, 0xf4240
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_lo_u32 v3, v3, v2
-; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
-; GCN-NEXT:    v_mul_hi_u32 v2, v2, s4
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v0
-; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0xf4240, v3
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT:    v_sub_u32_e64 v4, s[4:5], v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s6
+; GCN-NEXT:    v_readfirstlane_b32 s6, v0
+; GCN-NEXT:    s_mul_i32 s6, s6, s4
+; GCN-NEXT:    s_sub_i32 s6, 0xf4240, s6
+; GCN-NEXT:    s_sub_i32 s7, s6, s4
+; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT:    s_cmp_ge_u32 s6, s4
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    s_cselect_b32 s6, s7, s6
+; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT:    s_cmp_ge_u32 s6, s4
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, s5, v0
+; GCN-NEXT:    v_subrev_u32_e32 v0, vcc, s5, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 5, i32 ptrtoint (ptr addrspace(1) @gv to i32)
   %op = sdiv i32 1000000, %select
@@ -257,18 +255,15 @@ define i32 @select_sdiv_rhs_opaque_const0_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_rhs_opaque_const0_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, gv at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x392fa
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, gv at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, gv at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x30c30c31
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT:    s_mov_b32 s4, 0x30c30c31
-; GCN-NEXT:    v_mul_hi_i32 v0, v0, s4
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x392fa
+; GCN-NEXT:    v_mul_hi_i32 v0, s4, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 3, v0
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -287,18 +282,15 @@ define i32 @select_sdiv_rhs_opaque_const1_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_rhs_opaque_const1_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, gv at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0xa410
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, gv at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, gv at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x30c30c31
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT:    s_mov_b32 s4, 0x30c30c31
-; GCN-NEXT:    v_mul_hi_i32 v0, v0, s4
+; GCN-NEXT:    s_cselect_b32 s4, 0xa410, s6
+; GCN-NEXT:    v_mul_hi_i32 v0, s4, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 3, v0
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -316,11 +308,10 @@ define i32 @select_add_lhs_const_i32(i1 %cond) {
 ; GCN-LABEL: select_add_lhs_const_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0xf4248
-; GCN-NEXT:    v_mov_b32_e32 v2, 0xf4245
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_mov_b32 s6, 0xf4245
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0xf4248
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 5, i32 8
   %op = add i32 1000000, %select
@@ -335,11 +326,9 @@ define float @select_fadd_lhs_const_i32_fmf(i1 %cond) {
 ; GCN-LABEL: select_fadd_lhs_const_i32_fmf:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x40a00000
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x40400000
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x40a00000
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, float 2.0, float 4.0
   %op = fadd nnan nsz float 1.0, %select
@@ -351,12 +340,10 @@ define i32 @select_mul_lhs_const_i32(i1 %cond) {
 ; GCN-LABEL: select_mul_lhs_const_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x1f40
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x1388
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_movk_i32 s6, 0x1388
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x1f40
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; IR-LABEL: @select_mul_lhs_const_i32(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
 ; IR-NEXT:    ret i32 [[OP]]
@@ -370,12 +357,10 @@ define i32 @select_mul_rhs_const_i32(i1 %cond) {
 ; GCN-LABEL: select_mul_rhs_const_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x1f40
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x1388
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_movk_i32 s6, 0x1388
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x1f40
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; IR-LABEL: @select_mul_rhs_const_i32(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
 ; IR-NEXT:    ret i32 [[OP]]
@@ -411,9 +396,7 @@ define i16 @select_add_trunc_select(i1 %cond) {
 ; GCN-LABEL: select_add_trunc_select:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 47, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 47, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ; IR-LABEL: @select_add_trunc_select(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50
@@ -432,9 +415,9 @@ define i32 @select_add_sext_select(i1 %cond) {
 ; GCN-LABEL: select_add_sext_select:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 29, vcc
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, 29, 50
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i16 -13, i16 8
   %trunc = sext i16 %select to i32
@@ -450,9 +433,9 @@ define i32 @select_add_zext_select(i1 %cond) {
 ; GCN-LABEL: select_add_zext_select:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 47, vcc
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, 47, 50
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i16 5, i16 8
   %trunc = zext i16 %select to i32
@@ -468,11 +451,10 @@ define i32 @select_add_bitcast_select(i1 %cond) {
 ; GCN-LABEL: select_add_bitcast_select:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x4000002a
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x3f80002a
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_mov_b32 s6, 0x3f80002a
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x4000002a
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, float 1.0, float 2.0
   %trunc = bitcast float %select to i32
@@ -493,10 +475,8 @@ define <2 x half> @multi_use_cast_regression(i1 %cond) {
 ; GCN-LABEL: multi_use_cast_regression:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x3c00
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[4:5]
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GCN-NEXT:    v_sub_f32_e32 v1, 1.0, v0
 ; GCN-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 4963dc517574d..ab3ad0e8c0444 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -8,7 +8,7 @@ define void @void_func_i1(i1 %arg0) #0 {
 ; CIGFX89-LABEL: void_func_i1:
 ; CIGFX89:       ; %bb.0:
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    v_and_b32_e32 v0, 1, v0
+; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
@@ -18,7 +18,7 @@ define void @void_func_i1(i1 %arg0) #0 {
 ; GFX11-LABEL: void_func_i1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
@@ -31,6 +31,7 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
 ; CIGFX89-LABEL: void_func_i1_zeroext:
 ; CIGFX89:       ; %bb.0:
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; CIGFX89-NEXT:    v_or_b32_e32 v0, 12, v0
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
@@ -41,9 +42,11 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
 ; GFX11-LABEL: void_func_i1_zeroext:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_or_b32_e32 v0, 12, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, 12, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = zext i1 %arg0 to i32
@@ -56,7 +59,8 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
 ; CI-LABEL: void_func_i1_signext:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_add_i32_e32 v0, vcc, 12, v0
+; CI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CI-NEXT:    v_sub_i32_e32 v0, vcc, 12, v0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -66,7 +70,8 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
 ; VI-LABEL: void_func_i1_signext:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 12, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; VI-NEXT:    v_sub_u32_e32 v0, vcc, 12, v0
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -76,7 +81,8 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
 ; GFX9-LABEL: void_func_i1_signext:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, 12, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:    v_sub_u32_e32 v0, 12, v0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -86,9 +92,11 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
 ; GFX11-LABEL: void_func_i1_signext:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, 12, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 12, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = sext i1 %arg0 to i32
@@ -101,9 +109,7 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
 ; CIGFX89-LABEL: i1_arg_i1_use:
 ; CIGFX89:       ; %bb.0: ; %bb
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    v_and_b32_e32 v0, 1, v0
-; CIGFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; CIGFX89-NEXT:    s_xor_b64 s[6:7], vcc, -1
+; CIGFX89-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
 ; CIGFX89-NEXT:    s_and_saveexec_b64 s[4:5], s[6:7]
 ; CIGFX89-NEXT:    s_cbranch_execz .LBB3_2
 ; CIGFX89-NEXT:  ; %bb.1: ; %bb1
@@ -120,11 +126,9 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
 ; GFX11-LABEL: i1_arg_i1_use:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    s_xor_b32 s1, vcc_lo, -1
+; GFX11-NEXT:    s_xor_b32 s1, s0, -1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s1
 ; GFX11-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX11-NEXT:  ; %bb.1: ; %bb1
@@ -2775,13 +2779,11 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:12
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
-; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:16
-; CI-NEXT:    buffer_load_ubyte v17, off, s[0:3], s32 offset:4
-; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:8
-; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2790,14 +2792,15 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT:    v_and_b32_e32 v0, 1, v17
-; CI-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; CI-NEXT:    v_cvt_f16_f32_e32 v18, v20
+; CI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; CI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v18, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v16, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v19, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v18, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_short v16, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -2818,13 +2821,12 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:4
+; VI-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2833,14 +2835,13 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v0, 1, v20
-; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v18, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v20, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; VI-NEXT:    buffer_store_short v16, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v18, off, s[4:7], 0
+; VI-NEXT:    buffer_store_short v17, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_short v19, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -2859,15 +2860,12 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2876,14 +2874,13 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v20
-; GFX9-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v18, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v20, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_short v16, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v18, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_short v17, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_short v19, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -2892,16 +2889,15 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX11-LABEL: void_func_v32i32_i1_i8_i16_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x5
+; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    scratch_load_u8 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_u16 v33, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v34, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v35, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v36, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u16 v33, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_u16 v34, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v35, off, s32 offset:12
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    v_cndmask_b32_e64 v32, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
@@ -2910,8 +2906,6 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_and_b32_e32 v16, 1, v32
 ; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
@@ -2920,7 +2914,7 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v16, off, s[0:3], 0 dlc
+; GFX11-NEXT:    buffer_store_b8 v32, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
@@ -4628,7 +4622,6 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
   ret void
 }
 
-
 define void @void_func_bf16(bfloat %arg0) #0 {
 ; CI-LABEL: void_func_bf16:
 ; CI:       ; %bb.0:
@@ -4851,4 +4844,275 @@ define void @void_func_v16bf16(<16 x bfloat> %arg0) #0 {
   ret void
 }
 
+define void @many_i1_args(
+  i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
+  i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
+  i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
+  i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
+; GFX9-LABEL: many_i1_args:
+; GFX9:      ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_xor_saveexec_b64 vcc, -1
+; GFX9-NEXT:    buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, vcc
+; GFX9-NEXT:    v_writelane_b32 v19, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v19, s31, 1
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GFX9-NEXT:    s_mov_b32 s31, 0xf000
+; GFX9-NEXT:    s_mov_b32 s30, -1
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[6:7]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[8:9]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[10:11]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[12:13]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[14:15]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[16:17]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[18:19]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[20:21]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[22:23]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[24:25]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[26:27]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[28:29]
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v1
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v3
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v4
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v5
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v6
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v7
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v8
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v9
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v10
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v11
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v12
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v13
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v14
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v15
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v16
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v17
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v18
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v19, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v19, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: many_i1_args:
+; GFX11:      ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s32          ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, vcc_lo
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s1
+; GFX11-NEXT:    s_mov_b32 s30, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_mov_b32 s31, 0x31016000
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s6
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s11
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s13
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s15
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s16
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s17
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s19
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s20
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s21
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s22
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s23
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s24
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s25
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s26
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s27
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s28
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s29
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v1, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32           ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  store volatile i1 %arg2, ptr addrspace(1) undef
+  store volatile i1 %arg3, ptr addrspace(1) undef
+  store volatile i1 %arg4, ptr addrspace(1) undef
+  store volatile i1 %arg5, ptr addrspace(1) undef
+  store volatile i1 %arg6, ptr addrspace(1) undef
+  store volatile i1 %arg7, ptr addrspace(1) undef
+
+  store volatile i1 %arg8, ptr addrspace(1) undef
+  store volatile i1 %arg9, ptr addrspace(1) undef
+  store volatile i1 %arg10, ptr addrspace(1) undef
+  store volatile i1 %arg11, ptr addrspace(1) undef
+  store volatile i1 %arg12, ptr addrspace(1) undef
+  store volatile i1 %arg13, ptr addrspace(1) undef
+  store volatile i1 %arg14, ptr addrspace(1) undef
+  store volatile i1 %arg15, ptr addrspace(1) undef
+
+  store volatile i1 %arg16, ptr addrspace(1) undef
+  store volatile i1 %arg17, ptr addrspace(1) undef
+  store volatile i1 %arg18, ptr addrspace(1) undef
+  store volatile i1 %arg19, ptr addrspace(1) undef
+  store volatile i1 %arg20, ptr addrspace(1) undef
+  store volatile i1 %arg21, ptr addrspace(1) undef
+  store volatile i1 %arg22, ptr addrspace(1) undef
+  store volatile i1 %arg23, ptr addrspace(1) undef
+
+  store volatile i1 %arg24, ptr addrspace(1) undef
+  store volatile i1 %arg25, ptr addrspace(1) undef
+  store volatile i1 %arg26, ptr addrspace(1) undef
+  store volatile i1 %arg27, ptr addrspace(1) undef
+  store volatile i1 %arg28, ptr addrspace(1) undef
+  store volatile i1 %arg29, ptr addrspace(1) undef
+  store volatile i1 %arg30, ptr addrspace(1) undef
+  store volatile i1 %arg31, ptr addrspace(1) undef
+
+  ret void
+}
+
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index acadee2798171..fb4b89c4e8d00 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -12,6 +12,8 @@ define i1 @i1_func_void() #0 {
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
 ; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX789-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i1_func_void:
@@ -21,6 +23,9 @@ define i1 @i1_func_void() #0 {
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
diff --git a/llvm/test/CodeGen/AMDGPU/z_callee.ll b/llvm/test/CodeGen/AMDGPU/z_callee.ll
deleted file mode 100644
index 44af2c90f900b..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_callee.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-define void @void_func_i1(i1 %arg0) #0 {
-; For CIGFX89, the i1 arg is passed in s4, but the v_cndmask insn uses s[4:5].
-; Therefore, the "s_mov_b32 s5, 0" is generated.
-;
-; CIGFX89-LABEL: void_func_i1:
-; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
-; CIGFX89-NEXT:    s_mov_b32 s6, -1
-; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: void_func_i1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store i1 %arg0, ptr addrspace(1) undef
-  ret void
-}
-
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller.ll b/llvm/test/CodeGen/AMDGPU/z_caller.ll
deleted file mode 100644
index f9203cf078e47..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_caller.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-
-declare hidden void @external_void_func_i1(i1) #0
-
-define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
-; GFX9-LABEL: test_call_external_void_func_i1_imm:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s38, -1
-; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    s_mov_b64 s[4:5], -1
-; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_getpc_b64 s[8:9]
-; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_i1@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX11-LABEL: test_call_external_void_func_i1_imm:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    s_mov_b32 s0, -1
-; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1@rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1@rel32@hi+12
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT:    s_endpgm
-  call void @external_void_func_i1(i1 true)
-  ret void
-}
-
-attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller2.ll b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
deleted file mode 100644
index 1141476960250..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_caller2.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-
-declare hidden void @external_void_func_i1_signext(i1 signext) #0
-
-define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
-; GFX9-LABEL: test_call_external_void_func_i1_signext:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s38, -1
-; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s5
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_getpc_b64 s[8:9]
-; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_i1_signext@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_i1_signext@rel32@hi+12
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX11-LABEL: test_call_external_void_func_i1_signext:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[4:5]
-; GFX11-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 1, v0
-; GFX11-NEXT:    s_mov_b32 s0, s2
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX11-NEXT:    s_endpgm
-  %var = load volatile i1, ptr addrspace(1) undef
-  call void @external_void_func_i1_signext(i1 signext %var)
-  ret void
-}
-
-
-
-attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/z_return.ll b/llvm/test/CodeGen/AMDGPU/z_return.ll
deleted file mode 100644
index 6bf64da7a1b8f..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_return.ll
+++ /dev/null
@@ -1,80 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-define i1 @i1_func_void() #0 {
-  %val = load i1, ptr addrspace(1) undef
-  ret i1 %val
-}
-
-define void @test_call_i1_func_void() #0 {
-; CIGFX89-LABEL: test_call_i1_func_void:
-; CIGFX89: ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    s_mov_b32 s6, s33
-; CIGFX89-NEXT:    s_mov_b32 s33, s32
-; CIGFX89-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CIGFX89-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
-; CIGFX89-NEXT:    s_mov_b64 exec, s[4:5]
-; CIGFX89-NEXT:    s_addk_i32 s32, 0x400
-; CIGFX89-NEXT:    s_getpc_b64 s[4:5]
-; CIGFX89-NEXT:    s_add_u32 s4, s4, i1_func_void@gotpcrel32@lo+4
-; CIGFX89-NEXT:    s_addc_u32 s5, s5, i1_func_void@gotpcrel32@hi+12
-; CIGFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; CIGFX89-NEXT:    v_writelane_b32 v1, s30, 0
-; CIGFX89-NEXT:    v_writelane_b32 v1, s31, 1
-; CIGFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; CIGFX89-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CIGFX89-NEXT:    global_store_byte v[2:3], v0, off
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    v_readlane_b32 s31, v1, 1
-; CIGFX89-NEXT:    v_readlane_b32 s30, v1, 0
-; CIGFX89-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CIGFX89-NEXT:    buffer_load_dword v1, off, s[0:3], s33  ; 4-byte Folded Reload
-; CIGFX89-NEXT:    s_mov_b64 exec, s[4:5]
-; CIGFX89-NEXT:    s_addk_i32 s32, 0xfc00
-; CIGFX89-NEXT:    s_mov_b32 s33, s6
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: test_call_i1_func_void:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s2, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v1, s33          ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, i1_func_void@gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, i1_func_void@gotpcrel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v1, s30, 0
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_writelane_b32 v1, s31, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, s0, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v1, 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    global_store_b8 v[2:3], v0, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v1, off, s33           ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, -16
-; GFX11-NEXT:    s_mov_b32 s33, s2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-
-  %val = call i1 @i1_func_void()
-  store volatile i1 %val, ptr addrspace(1) undef
-  ret void
-}
-
-attributes #0 = { nounwind }
-
-

>From 089e49452253a630436f3b4262ff2fe3978636be Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Tue, 16 Jan 2024 16:22:20 -0600
Subject: [PATCH 6/9] Minor changes based on code review.

---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 11 +++++------
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  2 +-
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index c00021105b8c1..95cbd9bab2da4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -127,7 +127,7 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
       unsigned CopyToBits = 32;
 
       // When function return type is i1, it may be in a 64b register.
-      if (VA.getLocVT().getSizeInBits() == 1) {
+      if (VA.getLocVT() == MVT::i1) {
         if (MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64)
           CopyToBits = 64;
       }
@@ -241,15 +241,14 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
   void assignValueToReg(Register ValVReg, Register PhysReg,
                         const CCValAssign &VA) override {
     MIB.addUse(PhysReg, RegState::Implicit);
-    Register ExtReg;
 
-    if (VA.getLocVT().getSizeInBits() == 1 &&
+    if (VA.getLocVT() == MVT::i1 &&
         MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64) {
-      ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(64), ValVReg).getReg(0);
-    } else {
-      ExtReg = extendRegisterMin32(*this, ValVReg, VA);
+      MIRBuilder.buildCopy(PhysReg, ValVReg);
+      return;
     }
 
+    Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
     MIRBuilder.buildCopy(PhysReg, ExtReg);
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b4272466a187f..ea1d46c339389 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2995,7 +2995,7 @@ SDValue SITargetLowering::LowerFormalArguments(
     else if (AMDGPU::SGPR_32RegClass.contains(Reg))
       RC = &AMDGPU::SGPR_32RegClass;
     else {
-      if (VT == MVT::i1 && Subtarget->isWave64())
+      if (VT == MVT::i1)
         RC = Subtarget->getBoolRC();
       else
         llvm_unreachable("Unexpected register class in LowerFormalArguments!");

>From d01a9417e05d9cd1bd3a4a7e464e3e66545bae98 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 22 Jan 2024 16:23:43 -0600
Subject: [PATCH 7/9] Additional change based on code review.

---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 95cbd9bab2da4..da6a365af7c37 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -128,7 +128,7 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
 
       // When function return type is i1, it may be in a 64b register.
       if (VA.getLocVT() == MVT::i1) {
-        if (MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64)
+        if (MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
           CopyToBits = 64;
       }
 
@@ -243,7 +243,7 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
     MIB.addUse(PhysReg, RegState::Implicit);
 
     if (VA.getLocVT() == MVT::i1 &&
-        MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64) {
+        MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64()) {
       MIRBuilder.buildCopy(PhysReg, ValVReg);
       return;
     }

>From 0acda8cf8d7bebf15702a7b66b8a710201c67e4b Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Wed, 31 Jan 2024 12:54:51 -0600
Subject: [PATCH 8/9] Changing a vector of 4 registers to a single register.

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ea1d46c339389..b7e8679b103ac 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3648,14 +3648,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   // reserve these registers.
   if (!Subtarget->enableFlatScratch()) {
     if (IsChainCallConv)
-      CCInfo.AllocateRegBlock(
-          ArrayRef<MCPhysReg>{AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50,
-                              AMDGPU::SGPR51},
-          4);
+      CCInfo.AllocateReg(AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51);
     else
-      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1,
-                                                  AMDGPU::SGPR2, AMDGPU::SGPR3},
-                              4);
+      CCInfo.AllocateReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3);
   }
 
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);

>From 822d2cd3aabcc39f598a87e37d704dbfa15c512c Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 2 Feb 2024 16:07:43 -0600
Subject: [PATCH 9/9] Update some test files.

---
 .../AMDGPU/GlobalISel/irtranslator-call.ll    |   21 +-
 .../GlobalISel/irtranslator-function-args.ll  |  173 +-
 .../GlobalISel/irtranslator-invariant.ll      |    4 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll |   16 +-
 .../CodeGen/AMDGPU/GlobalISel/localizer.ll    |    2 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll              | 1622 +++++++----------
 llvm/test/CodeGen/AMDGPU/call-args-inreg.ll   |  325 ++--
 .../CodeGen/AMDGPU/call-argument-types.ll     |  155 +-
 .../CodeGen/AMDGPU/combine_andor_with_cmps.ll |  474 ++---
 .../dagcombine-v1i8-extractvecelt-crash.ll    |   13 +-
 .../AMDGPU/divergence-driven-trunc-to-i1.ll   |   42 +-
 llvm/test/CodeGen/AMDGPU/extract-load-i1.ll   |    2 +
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll |  130 +-
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |  344 ++--
 14 files changed, 1392 insertions(+), 1931 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index 93c8355a28c9f..44746ed11c64a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -368,12 +368,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
   ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
   ; CHECK-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32)
   ; CHECK-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s1)
-  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[ANYEXT]](s64)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[C]](s1)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF]](p4)
   ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
   ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
   ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
@@ -426,12 +425,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
   ; CHECK-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32)
   ; CHECK-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[SEXT]](s64)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[LOAD]](s1)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF1]](p4)
   ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
   ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
   ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
@@ -485,12 +483,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
   ; CHECK-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32)
   ; CHECK-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[ZEXT]](s64)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[LOAD]](s1)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF1]](p4)
   ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
   ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
   ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index 696bb34d601be..e24f32b941b04 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -35,9 +35,9 @@ define void @void_func_empty_array([0 x i8] %arg0, i32 %arg1) #0 {
 define void @void_func_i1(i1 %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
+  ; CHECK-NEXT:   liveins: $sgpr16_sgpr17
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
@@ -49,9 +49,9 @@ define void @void_func_i1(i1 %arg0) #0 {
 define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1_zeroext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
+  ; CHECK-NEXT:   liveins: $sgpr16_sgpr17
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -68,9 +68,9 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
 define void @void_func_i1_signext(i1 signext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1_signext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
+  ; CHECK-NEXT:   liveins: $sgpr16_sgpr17
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -88,9 +88,9 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
   ; CHECK-LABEL: name: i1_arg_i1_use
   ; CHECK: bb.1.bb:
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
+  ; CHECK-NEXT:   liveins: $sgpr16_sgpr17
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
@@ -1988,7 +1988,7 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
   ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, addrspace 5)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32)
-  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s64)
   ; CHECK-NEXT:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
   ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s16) from %fixed-stack.2, align 4, addrspace 5)
@@ -2781,8 +2781,8 @@ define void @void_func_i1_inreg(i1 inreg %arg0) #0 {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $sgpr16
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr16
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
@@ -3229,6 +3229,9 @@ define void @void_func_v2p3_inreg(<2 x ptr addrspace(3)> inreg %arg0) #0 {
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<2 x p3>), [[DEF]](p1) :: (store (<2 x p3>) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
   store <2 x ptr addrspace(3)> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
 ; Check calling convention for i1 args
 define void @many_i1_args(
   i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
@@ -3237,71 +3240,71 @@ define void @many_i1_args(
   i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
 ; CHECK-LABEL: name: many_i1_args
 ; CHECK: bb.1 (%ir-block.0):
-; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
+; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
 ; CHECK-NEXT: {{  $}}
-; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
 ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
-; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s64) = COPY $sgpr2_sgpr3
+; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s64) = COPY $sgpr18_sgpr19
 ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s64)
-; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr20_sgpr21
 ; CHECK-NEXT:   [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
-; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
+; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr22_sgpr23
 ; CHECK-NEXT:   [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
-; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(s64) = COPY $sgpr8_sgpr9
+; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(s64) = COPY $sgpr24_sgpr25
 ; CHECK-NEXT:   [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s64)
-; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(s64) = COPY $sgpr10_sgpr11
+; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(s64) = COPY $sgpr26_sgpr27
 ; CHECK-NEXT:   [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s64)
-; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(s64) = COPY $sgpr12_sgpr13
+; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(s64) = COPY $sgpr28_sgpr29
 ; CHECK-NEXT:   [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s64)
-; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(s64) = COPY $sgpr14_sgpr15
-; CHECK-NEXT:   [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s64)
-; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
-; CHECK-NEXT:   [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s64)
-; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(s64) = COPY $sgpr18_sgpr19
-; CHECK-NEXT:   [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s64)
-; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(s64) = COPY $sgpr20_sgpr21
-; CHECK-NEXT:   [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s64)
-; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(s64) = COPY $sgpr22_sgpr23
-; CHECK-NEXT:   [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s64)
-; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s64) = COPY $sgpr24_sgpr25
-; CHECK-NEXT:   [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s64)
-; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s64) = COPY $sgpr26_sgpr27
-; CHECK-NEXT:   [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s64)
-; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s64) = COPY $sgpr28_sgpr29
-; CHECK-NEXT:   [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s64)
-; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr0
+; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr0
+; CHECK-NEXT:   [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
+; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr1
+; CHECK-NEXT:   [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
+; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr2
+; CHECK-NEXT:   [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
+; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr3
+; CHECK-NEXT:   [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
+; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr4
+; CHECK-NEXT:   [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
+; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr5
+; CHECK-NEXT:   [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
+; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr6
+; CHECK-NEXT:   [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
+; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr7
+; CHECK-NEXT:   [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
+; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr8
 ; CHECK-NEXT:   [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
-; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr1
+; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr9
 ; CHECK-NEXT:   [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
-; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr2
+; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr10
 ; CHECK-NEXT:   [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
-; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr3
+; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr11
 ; CHECK-NEXT:   [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
-; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr4
+; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr12
 ; CHECK-NEXT:   [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
-; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr5
+; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr13
 ; CHECK-NEXT:   [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
-; CHECK-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr6
+; CHECK-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr14
 ; CHECK-NEXT:   [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
-; CHECK-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr7
+; CHECK-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr15
 ; CHECK-NEXT:   [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
-; CHECK-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr8
+; CHECK-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr16
 ; CHECK-NEXT:   [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
-; CHECK-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr9
+; CHECK-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr17
 ; CHECK-NEXT:   [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
-; CHECK-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr10
+; CHECK-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr18
 ; CHECK-NEXT:   [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
-; CHECK-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr11
+; CHECK-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr19
 ; CHECK-NEXT:   [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
-; CHECK-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr12
+; CHECK-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr20
 ; CHECK-NEXT:   [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
-; CHECK-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr13
+; CHECK-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr21
 ; CHECK-NEXT:   [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
-; CHECK-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr14
+; CHECK-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr22
 ; CHECK-NEXT:   [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
-; CHECK-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr15
+; CHECK-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr23
 ; CHECK-NEXT:   [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
-; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr16
+; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr24
 ; CHECK-NEXT:   [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
 ;
 ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -3311,71 +3314,71 @@ define void @many_i1_args(
 ;
 ; GFX11-LABEL: name: many_i1_args
 ; GFX11: bb.1 (%ir-block.0):
-; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1
+; GFX11-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
 ; GFX11-NEXT: {{  $}}
-; GFX11-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr16
 ; GFX11-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr17
 ; GFX11-NEXT:   [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
-; GFX11-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+; GFX11-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr18
 ; GFX11-NEXT:   [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
-; GFX11-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr3
+; GFX11-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr19
 ; GFX11-NEXT:   [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s32)
-; GFX11-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX11-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr20
 ; GFX11-NEXT:   [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s32)
-; GFX11-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr5
+; GFX11-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr21
 ; GFX11-NEXT:   [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s32)
-; GFX11-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX11-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr22
 ; GFX11-NEXT:   [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s32)
-; GFX11-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr7
+; GFX11-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr23
 ; GFX11-NEXT:   [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
-; GFX11-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr8
+; GFX11-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr24
 ; GFX11-NEXT:   [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
-; GFX11-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr9
+; GFX11-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr25
 ; GFX11-NEXT:   [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
-; GFX11-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr10
+; GFX11-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr26
 ; GFX11-NEXT:   [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
-; GFX11-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr11
+; GFX11-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr27
 ; GFX11-NEXT:   [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
-; GFX11-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr12
+; GFX11-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr28
 ; GFX11-NEXT:   [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
-; GFX11-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY $sgpr13
+; GFX11-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY $sgpr29
 ; GFX11-NEXT:   [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
-; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY $sgpr14
+; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX11-NEXT:   [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
-; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $sgpr15
+; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX11-NEXT:   [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
-; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $sgpr16
+; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr2
 ; GFX11-NEXT:   [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
-; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $sgpr17
+; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX11-NEXT:   [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
-; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $sgpr18
+; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr4
 ; GFX11-NEXT:   [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
-; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $sgpr19
+; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr5
 ; GFX11-NEXT:   [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
-; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $sgpr20
+; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr6
 ; GFX11-NEXT:   [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
-; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $sgpr21
+; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr7
 ; GFX11-NEXT:   [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
-; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $sgpr22
+; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr8
 ; GFX11-NEXT:   [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
-; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $sgpr23
+; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr9
 ; GFX11-NEXT:   [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
-; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $sgpr24
+; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr10
 ; GFX11-NEXT:   [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
-; GFX11-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $sgpr25
+; GFX11-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr11
 ; GFX11-NEXT:   [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
-; GFX11-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $sgpr26
+; GFX11-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr12
 ; GFX11-NEXT:   [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
-; GFX11-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $sgpr27
+; GFX11-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr13
 ; GFX11-NEXT:   [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
-; GFX11-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $sgpr28
+; GFX11-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr14
 ; GFX11-NEXT:   [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
-; GFX11-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $sgpr29
+; GFX11-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr15
 ; GFX11-NEXT:   [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
-; GFX11-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr16
 ; GFX11-NEXT:   [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
-; GFX11-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX11-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr17
 ; GFX11-NEXT:   [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
 ;
 ; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
index ac1eb4e2adda0..6360c5c2cbb2e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
@@ -22,9 +22,9 @@ define i32 @load_const_i32_gv() {
 define i32 @load_select_const_i32_gv(i1 %cond) {
   ; CHECK-LABEL: name: load_select_const_i32_gv
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv0
   ; CHECK-NEXT:   [[GV1:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index c3b8a6b2b7526..ef0c47637015c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -10,7 +10,7 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX7-LABEL: v_div_fmas_f32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s4, 1, s0
+; GFX7-NEXT:    s_and_b32 s4, 1, s4
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX7-NEXT:    s_nop 3
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
@@ -19,7 +19,7 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX8-LABEL: v_div_fmas_f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, 1, s0
+; GFX8-NEXT:    s_and_b32 s4, 1, s4
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX8-NEXT:    s_nop 3
 ; GFX8-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
@@ -28,7 +28,7 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX10_W32-LABEL: v_div_fmas_f32:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W32-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W32-NEXT:    s_and_b32 s4, 1, s4
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX10_W32-NEXT:    s_setpc_b64 s[30:31]
@@ -36,7 +36,7 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX10_W64-LABEL: v_div_fmas_f32:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W64-NEXT:    s_and_b32 s4, 1, s4
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX10_W64-NEXT:    s_setpc_b64 s[30:31]
@@ -64,7 +64,7 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX7-LABEL: v_div_fmas_f64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s4, 1, s0
+; GFX7-NEXT:    s_and_b32 s4, 1, s4
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX7-NEXT:    s_nop 3
 ; GFX7-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
@@ -73,7 +73,7 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX8-LABEL: v_div_fmas_f64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, 1, s0
+; GFX8-NEXT:    s_and_b32 s4, 1, s4
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX8-NEXT:    s_nop 3
 ; GFX8-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
@@ -82,7 +82,7 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX10_W32-LABEL: v_div_fmas_f64:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W32-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W32-NEXT:    s_and_b32 s4, 1, s4
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX10_W32-NEXT:    s_setpc_b64 s[30:31]
@@ -90,7 +90,7 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX10_W64-LABEL: v_div_fmas_f64:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W64-NEXT:    s_and_b32 s4, 1, s4
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX10_W64-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX10_W64-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 1cff9ba4d2340..4d04d6b7570c2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -168,7 +168,7 @@ define void @localize_internal_globals(i1 %cond) {
 ; GFX9-LABEL: localize_internal_globals:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s4, 1, s0
+; GFX9-NEXT:    s_and_b32 s4, 1, s4
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 2f7190e761102..0ca007c2e84d5 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -18856,37 +18856,37 @@ define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) {
 ; GCN-LABEL: v_fcmp_false_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_mov_b64 s[0:1], 0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_false_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_false_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_false_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_false_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_mov_b32 s0, 0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_false_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp false bfloat %a, %b
   ret i1 %op
@@ -18898,8 +18898,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_eq_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_oeq_bf16:
@@ -18907,8 +18906,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_oeq_bf16:
@@ -18916,8 +18914,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_oeq_bf16:
@@ -18925,8 +18922,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_oeq_bf16:
@@ -18934,8 +18930,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_oeq_bf16:
@@ -18944,8 +18939,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp oeq bfloat %a, %b
   ret i1 %op
@@ -18957,8 +18951,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_gt_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ogt_bf16:
@@ -18966,8 +18959,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_gt_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ogt_bf16:
@@ -18975,8 +18967,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ogt_bf16:
@@ -18984,8 +18975,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ogt_bf16:
@@ -18993,8 +18983,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ogt_bf16:
@@ -19003,8 +18992,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp ogt bfloat %a, %b
   ret i1 %op
@@ -19016,8 +19004,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_oge_bf16:
@@ -19025,8 +19012,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_ge_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_oge_bf16:
@@ -19034,8 +19020,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_oge_bf16:
@@ -19043,8 +19028,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_oge_bf16:
@@ -19052,8 +19036,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_oge_bf16:
@@ -19062,8 +19045,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp oge bfloat %a, %b
   ret i1 %op
@@ -19075,8 +19057,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_lt_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_olt_bf16:
@@ -19084,8 +19065,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_lt_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_olt_bf16:
@@ -19093,8 +19073,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_olt_bf16:
@@ -19102,8 +19081,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_olt_bf16:
@@ -19111,8 +19089,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_olt_bf16:
@@ -19121,8 +19098,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp olt bfloat %a, %b
   ret i1 %op
@@ -19134,8 +19110,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_le_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ole_bf16:
@@ -19143,8 +19118,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_le_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ole_bf16:
@@ -19152,8 +19126,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_le_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ole_bf16:
@@ -19161,8 +19134,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_le_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ole_bf16:
@@ -19170,8 +19142,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_le_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ole_bf16:
@@ -19180,8 +19151,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_le_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp ole bfloat %a, %b
   ret i1 %op
@@ -19193,8 +19163,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_lg_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_one_bf16:
@@ -19202,8 +19171,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_lg_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_one_bf16:
@@ -19211,8 +19179,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_lg_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_one_bf16:
@@ -19220,8 +19187,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_lg_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_one_bf16:
@@ -19229,8 +19195,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_lg_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_lg_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_one_bf16:
@@ -19239,8 +19204,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_lg_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_lg_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp one bfloat %a, %b
   ret i1 %op
@@ -19252,8 +19216,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_uno_bf16:
@@ -19261,8 +19224,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_uno_bf16:
@@ -19270,8 +19232,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_uno_bf16:
@@ -19279,8 +19240,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_uno_bf16:
@@ -19288,8 +19248,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_uno_bf16:
@@ -19298,8 +19257,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp uno bfloat %a, %b
   ret i1 %op
@@ -19311,8 +19269,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_nlg_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ueq_bf16:
@@ -19320,8 +19277,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_nlg_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ueq_bf16:
@@ -19329,8 +19285,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_nlg_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ueq_bf16:
@@ -19338,8 +19293,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_nlg_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ueq_bf16:
@@ -19347,8 +19301,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nlg_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ueq_bf16:
@@ -19357,8 +19310,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nlg_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp ueq bfloat %a, %b
   ret i1 %op
@@ -19370,8 +19322,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_nle_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ugt_bf16:
@@ -19379,8 +19330,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_nle_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ugt_bf16:
@@ -19388,8 +19338,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_nle_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ugt_bf16:
@@ -19397,8 +19346,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_nle_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ugt_bf16:
@@ -19406,8 +19354,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_nle_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nle_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ugt_bf16:
@@ -19416,8 +19363,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_nle_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nle_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp ugt bfloat %a, %b
   ret i1 %op
@@ -19429,8 +19375,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_nlt_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_uge_bf16:
@@ -19438,8 +19383,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_nlt_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_uge_bf16:
@@ -19447,8 +19391,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_nlt_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_uge_bf16:
@@ -19456,8 +19399,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_nlt_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_uge_bf16:
@@ -19465,8 +19407,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nlt_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_uge_bf16:
@@ -19475,8 +19416,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nlt_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp uge bfloat %a, %b
   ret i1 %op
@@ -19488,8 +19428,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_nge_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ult_bf16:
@@ -19497,8 +19436,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_nge_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ult_bf16:
@@ -19506,8 +19444,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_nge_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ult_bf16:
@@ -19515,8 +19452,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_nge_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ult_bf16:
@@ -19524,8 +19460,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nge_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ult_bf16:
@@ -19534,8 +19469,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nge_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp ult bfloat %a, %b
   ret i1 %op
@@ -19547,8 +19481,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_ngt_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_ule_bf16:
@@ -19556,8 +19489,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_ngt_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_ule_bf16:
@@ -19565,8 +19497,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_ngt_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_ule_bf16:
@@ -19574,8 +19505,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_ngt_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_ule_bf16:
@@ -19583,8 +19513,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_ngt_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_ule_bf16:
@@ -19593,8 +19522,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ngt_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp ule bfloat %a, %b
   ret i1 %op
@@ -19606,8 +19534,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    v_cmp_neq_f32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_une_bf16:
@@ -19615,8 +19542,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_neq_f32_e64 s[0:1], v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_une_bf16:
@@ -19624,8 +19550,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cmp_neq_f32_e64 s[0:1], v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_une_bf16:
@@ -19633,8 +19558,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_neq_f32_e64 s[0:1], v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_une_bf16:
@@ -19642,8 +19566,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cmp_neq_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_neq_f32_e64 s0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_une_bf16:
@@ -19652,8 +19575,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_neq_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_neq_f32_e64 s0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp une bfloat %a, %b
   ret i1 %op
@@ -19663,37 +19585,37 @@ define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) {
 ; GCN-LABEL: v_fcmp_true_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, 1
+; GCN-NEXT:    s_mov_b64 s[0:1], -1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fcmp_true_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, 1
+; GFX7-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fcmp_true_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, 1
+; GFX8-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fcmp_true_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fcmp_true_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, 1
+; GFX10-NEXT:    s_mov_b32 s0, -1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fcmp_true_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, 1
+; GFX11-NEXT:    s_mov_b32 s0, -1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fcmp true bfloat %a, %b
   ret i1 %op
@@ -24742,52 +24664,39 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GCN-LABEL: v_select_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_select_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_select_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, bfloat %a, bfloat %b
   ret bfloat %op
@@ -24797,58 +24706,46 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GCN-LABEL: v_select_fneg_lhs_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_select_fneg_lhs_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_select_fneg_lhs_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_fneg_lhs_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_fneg_lhs_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_fneg_lhs_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg bfloat %a
   %op = select i1 %cond, bfloat %neg.a, bfloat %b
@@ -24859,58 +24756,46 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GCN-LABEL: v_select_fneg_rhs_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_select_fneg_rhs_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_select_fneg_rhs_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_fneg_rhs_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_fneg_rhs_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_fneg_rhs_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.b = fneg bfloat %b
   %op = select i1 %cond, bfloat %a, bfloat %neg.b
@@ -24921,81 +24806,69 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
 ; GCN-LABEL: v_select_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_select_v2bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_select_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[4:5]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, v2, s4
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v3, v2, s0
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
@@ -25371,71 +25244,59 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT:    v_alignbit_b32 v2, v5, v4, 16
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT:    v_alignbit_b32 v1, v4, v3, 16
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v5, v2, s[4:5]
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_select_v3bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_alignbit_b32 v2, v2, v4, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_alignbit_b32 v1, v1, v3, 16
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v5, v2, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_select_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v3bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
   ret <3 x bfloat> %op
@@ -25445,18 +25306,16 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
 ; GCN-LABEL: v_select_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT:    v_alignbit_b32 v2, v6, v5, 16
-; GCN-NEXT:    v_alignbit_b32 v3, v4, v3, 16
-; GCN-NEXT:    v_alignbit_b32 v4, v8, v7, 16
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GCN-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GCN-NEXT:    v_alignbit_b32 v3, v7, v6, 16
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -25466,18 +25325,16 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
 ; GFX7-LABEL: v_select_v4bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
-; GFX7-NEXT:    v_alignbit_b32 v3, v4, v3, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v8
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_alignbit_b32 v2, v2, v5, 16
-; GFX7-NEXT:    v_alignbit_b32 v4, v4, v7, 16
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v1, v1, v4, 16
+; GFX7-NEXT:    v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -25487,37 +25344,29 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
 ; GFX8-LABEL: v_select_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
   ret <4 x bfloat> %op
@@ -25527,23 +25376,21 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
 ; GCN-LABEL: v_select_v6bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT:    v_alignbit_b32 v2, v8, v7, 16
-; GCN-NEXT:    v_alignbit_b32 v3, v4, v3, 16
-; GCN-NEXT:    v_alignbit_b32 v4, v10, v9, 16
-; GCN-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; GCN-NEXT:    v_alignbit_b32 v6, v12, v11, 16
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT:    v_alignbit_b32 v1, v7, v6, 16
+; GCN-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GCN-NEXT:    v_alignbit_b32 v3, v9, v8, 16
+; GCN-NEXT:    v_alignbit_b32 v4, v5, v4, 16
+; GCN-NEXT:    v_alignbit_b32 v5, v11, v10, 16
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -25555,23 +25402,21 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
 ; GFX7-LABEL: v_select_v6bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
-; GFX7-NEXT:    v_alignbit_b32 v3, v4, v3, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v10
-; GFX7-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v12
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_alignbit_b32 v2, v2, v7, 16
-; GFX7-NEXT:    v_alignbit_b32 v4, v4, v9, 16
-; GFX7-NEXT:    v_alignbit_b32 v6, v6, v11, 16
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
+; GFX7-NEXT:    v_alignbit_b32 v4, v5, v4, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v11
+; GFX7-NEXT:    v_alignbit_b32 v1, v1, v6, 16
+; GFX7-NEXT:    v_alignbit_b32 v3, v3, v8, 16
+; GFX7-NEXT:    v_alignbit_b32 v5, v5, v10, 16
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -25583,41 +25428,33 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
 ; GFX8-LABEL: v_select_v6bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v6bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v6bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v6bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v1, v5, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b
   ret <6 x bfloat> %op
@@ -25627,28 +25464,26 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
 ; GCN-LABEL: v_select_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT:    v_alignbit_b32 v2, v10, v9, 16
-; GCN-NEXT:    v_alignbit_b32 v3, v4, v3, 16
-; GCN-NEXT:    v_alignbit_b32 v4, v12, v11, 16
-; GCN-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; GCN-NEXT:    v_alignbit_b32 v6, v14, v13, 16
-; GCN-NEXT:    v_alignbit_b32 v7, v8, v7, 16
-; GCN-NEXT:    v_alignbit_b32 v8, v16, v15, 16
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT:    v_alignbit_b32 v1, v9, v8, 16
+; GCN-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GCN-NEXT:    v_alignbit_b32 v3, v11, v10, 16
+; GCN-NEXT:    v_alignbit_b32 v4, v5, v4, 16
+; GCN-NEXT:    v_alignbit_b32 v5, v13, v12, 16
+; GCN-NEXT:    v_alignbit_b32 v6, v7, v6, 16
+; GCN-NEXT:    v_alignbit_b32 v7, v15, v14, 16
+; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -25662,28 +25497,26 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
 ; GFX7-LABEL: v_select_v8bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v10
-; GFX7-NEXT:    v_alignbit_b32 v3, v4, v3, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v12
-; GFX7-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
-; GFX7-NEXT:    v_alignbit_b32 v7, v8, v7, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v16
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_alignbit_b32 v2, v2, v9, 16
-; GFX7-NEXT:    v_alignbit_b32 v4, v4, v11, 16
-; GFX7-NEXT:    v_alignbit_b32 v6, v6, v13, 16
-; GFX7-NEXT:    v_alignbit_b32 v8, v8, v15, 16
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v9
+; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v11
+; GFX7-NEXT:    v_alignbit_b32 v4, v5, v4, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v13
+; GFX7-NEXT:    v_alignbit_b32 v6, v7, v6, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
+; GFX7-NEXT:    v_alignbit_b32 v1, v1, v8, 16
+; GFX7-NEXT:    v_alignbit_b32 v3, v3, v10, 16
+; GFX7-NEXT:    v_alignbit_b32 v5, v5, v12, 16
+; GFX7-NEXT:    v_alignbit_b32 v7, v7, v14, 16
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -25697,44 +25530,37 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
 ; GFX8-LABEL: v_select_v8bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v8bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v1, v6, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v7, v3 :: v_dual_cndmask_b32 v3, v8, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b
   ret <8 x bfloat> %op
@@ -25744,47 +25570,44 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GCN-LABEL: v_select_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GCN-NEXT:    v_alignbit_b32 v2, v2, v17, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT:    v_alignbit_b32 v3, v4, v3, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v20
-; GCN-NEXT:    v_alignbit_b32 v4, v4, v19, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v22
-; GCN-NEXT:    v_alignbit_b32 v6, v6, v21, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT:    v_alignbit_b32 v7, v8, v7, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v24
-; GCN-NEXT:    v_alignbit_b32 v8, v8, v23, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT:    v_alignbit_b32 v9, v10, v9, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v26
-; GCN-NEXT:    v_alignbit_b32 v10, v10, v25, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v28
-; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT:    v_lshrrev_b32_e32 v18, 16, v30
-; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT:    v_alignbit_b32 v11, v12, v11, 16
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:4
-; GCN-NEXT:    v_alignbit_b32 v12, v17, v27, 16
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_alignbit_b32 v13, v14, v13, 16
-; GCN-NEXT:    v_alignbit_b32 v14, v18, v29, 16
-; GCN-NEXT:    v_alignbit_b32 v15, v16, v15, 16
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v13, v14, v13, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
+; GCN-NEXT:    v_alignbit_b32 v1, v1, v16, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
+; GCN-NEXT:    v_alignbit_b32 v3, v3, v18, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_alignbit_b32 v4, v5, v4, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v21
+; GCN-NEXT:    v_alignbit_b32 v5, v5, v20, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_alignbit_b32 v6, v7, v6, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
+; GCN-NEXT:    v_alignbit_b32 v7, v7, v22, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    v_alignbit_b32 v8, v9, v8, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v25
+; GCN-NEXT:    v_alignbit_b32 v9, v9, v24, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v27
+; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v29
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT:    v_alignbit_b32 v10, v11, v10, 16
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32
+; GCN-NEXT:    v_alignbit_b32 v11, v16, v26, 16
+; GCN-NEXT:    v_alignbit_b32 v12, v13, v12, 16
+; GCN-NEXT:    v_alignbit_b32 v13, v17, v28, 16
+; GCN-NEXT:    v_alignbit_b32 v14, v15, v14, 16
+; GCN-NEXT:    v_cndmask_b32_e64 v13, v13, v12, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v11, v11, v10, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v8, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -25799,11 +25622,10 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
 ; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v19
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_alignbit_b32 v14, v14, v17, 16
-; GCN-NEXT:    v_cndmask_b32_e32 v15, v14, v15, vcc
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
+; GCN-NEXT:    v_alignbit_b32 v15, v15, v30, 16
+; GCN-NEXT:    v_cndmask_b32_e64 v15, v15, v14, s[4:5]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
 ; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -25811,47 +25633,44 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GFX7-LABEL: v_select_v16bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_alignbit_b32 v11, v12, v11, 16
-; GFX7-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
-; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s32
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_alignbit_b32 v7, v8, v7, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v24
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_alignbit_b32 v3, v4, v3, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v20
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    v_alignbit_b32 v8, v8, v23, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_alignbit_b32 v2, v2, v17, 16
-; GFX7-NEXT:    v_alignbit_b32 v4, v4, v19, 16
-; GFX7-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v22
-; GFX7-NEXT:    v_alignbit_b32 v9, v10, v9, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v26
-; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 16, v28
-; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT:    v_lshrrev_b32_e32 v19, 16, v30
-; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX7-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
-; GFX7-NEXT:    v_alignbit_b32 v6, v6, v21, 16
-; GFX7-NEXT:    v_alignbit_b32 v10, v10, v25, 16
-; GFX7-NEXT:    v_alignbit_b32 v17, v17, v27, 16
-; GFX7-NEXT:    v_alignbit_b32 v13, v14, v13, 16
-; GFX7-NEXT:    v_alignbit_b32 v14, v19, v29, 16
-; GFX7-NEXT:    v_alignbit_b32 v15, v16, v15, 16
-; GFX7-NEXT:    v_cndmask_b32_e32 v13, v14, v13, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v11, v17, v11, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT:    v_alignbit_b32 v10, v11, v10, 16
+; GFX7-NEXT:    buffer_load_dword v11, off, s[0:3], s32
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
+; GFX7-NEXT:    v_alignbit_b32 v1, v1, v16, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v27
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 16, v29
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT:    v_alignbit_b32 v16, v16, v26, 16
+; GFX7-NEXT:    v_alignbit_b32 v12, v13, v12, 16
+; GFX7-NEXT:    v_alignbit_b32 v13, v17, v28, 16
+; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
+; GFX7-NEXT:    v_alignbit_b32 v4, v5, v4, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v21
+; GFX7-NEXT:    v_alignbit_b32 v6, v7, v6, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
+; GFX7-NEXT:    v_alignbit_b32 v8, v9, v8, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v25
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v13, v12, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v12, v16, v10, s[4:5]
+; GFX7-NEXT:    v_alignbit_b32 v3, v3, v18, 16
+; GFX7-NEXT:    v_alignbit_b32 v5, v5, v20, 16
+; GFX7-NEXT:    v_alignbit_b32 v7, v7, v22, 16
+; GFX7-NEXT:    v_alignbit_b32 v9, v9, v24, 16
+; GFX7-NEXT:    v_alignbit_b32 v14, v15, v14, 16
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v9, v8, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -25860,17 +25679,16 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v12
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_alignbit_b32 v8, v8, v18, 16
-; GFX7-NEXT:    v_cndmask_b32_e32 v15, v8, v15, vcc
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
 ; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v11
+; GFX7-NEXT:    v_alignbit_b32 v10, v10, v30, 16
+; GFX7-NEXT:    v_cndmask_b32_e64 v15, v10, v14, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
 ; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -25878,58 +25696,53 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GFX8-LABEL: v_select_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v13, v5, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v14, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v15, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v16, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v12, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v13, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v14, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v15, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v16, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v16bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v10, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v12, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v13, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v14, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v16, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v9, v1 :: v_dual_cndmask_b32 v1, v10, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v11, v3 :: v_dual_cndmask_b32 v3, v12, v4
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v13, v5 :: v_dual_cndmask_b32 v5, v14, v6
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v15, v7 :: v_dual_cndmask_b32 v7, v16, v8
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b
   ret <16 x bfloat> %op
@@ -25939,156 +25752,152 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GCN-LABEL: v_select_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; GCN-NEXT:    v_alignbit_b32 v2, v2, v3, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
-; GCN-NEXT:    v_alignbit_b32 v3, v3, v5, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v8
-; GCN-NEXT:    v_alignbit_b32 v4, v4, v7, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
-; GCN-NEXT:    v_alignbit_b32 v5, v5, v9, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v12
-; GCN-NEXT:    v_alignbit_b32 v6, v6, v11, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v14
-; GCN-NEXT:    v_alignbit_b32 v7, v7, v13, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v16
-; GCN-NEXT:    v_alignbit_b32 v8, v8, v15, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
-; GCN-NEXT:    v_alignbit_b32 v9, v9, v17, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v20
-; GCN-NEXT:    v_alignbit_b32 v10, v10, v19, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v22
-; GCN-NEXT:    v_alignbit_b32 v11, v11, v21, 16
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:12
-; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v24
-; GCN-NEXT:    v_alignbit_b32 v12, v12, v23, 16
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v26
-; GCN-NEXT:    v_alignbit_b32 v13, v13, v25, 16
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:20
-; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v28
-; GCN-NEXT:    v_alignbit_b32 v14, v14, v27, 16
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
-; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v30
-; GCN-NEXT:    v_alignbit_b32 v15, v15, v29, 16
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:28
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:24
-; GCN-NEXT:    s_waitcnt vmcnt(5)
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
-; GCN-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NEXT:    v_alignbit_b32 v0, v0, v17, 16
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:36
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GCN-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GCN-NEXT:    v_alignbit_b32 v2, v2, v4, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
+; GCN-NEXT:    v_alignbit_b32 v3, v3, v6, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v9
+; GCN-NEXT:    v_alignbit_b32 v4, v4, v8, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v11
+; GCN-NEXT:    v_alignbit_b32 v5, v5, v10, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v13
+; GCN-NEXT:    v_alignbit_b32 v6, v6, v12, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
+; GCN-NEXT:    v_alignbit_b32 v7, v7, v14, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v17
+; GCN-NEXT:    v_alignbit_b32 v8, v8, v16, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
+; GCN-NEXT:    v_alignbit_b32 v9, v9, v18, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v21
+; GCN-NEXT:    v_alignbit_b32 v10, v10, v20, 16
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:8
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v23
+; GCN-NEXT:    v_alignbit_b32 v11, v11, v22, 16
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v25
+; GCN-NEXT:    v_alignbit_b32 v12, v12, v24, 16
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:16
+; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v27
+; GCN-NEXT:    v_alignbit_b32 v13, v13, v26, 16
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v29
+; GCN-NEXT:    v_alignbit_b32 v14, v14, v28, 16
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:24
 ; GCN-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v18
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    v_alignbit_b32 v16, v16, v19, 16
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:32
+; GCN-NEXT:    v_alignbit_b32 v15, v15, v16, 16
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v20
+; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v17
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_alignbit_b32 v17, v17, v21, 16
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:44
+; GCN-NEXT:    v_alignbit_b32 v16, v16, v18, 16
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:32
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_alignbit_b32 v17, v17, v20, 16
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:28
 ; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:40
-; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    v_lshrrev_b32_e32 v21, 16, v22
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_alignbit_b32 v18, v21, v18, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:36
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_alignbit_b32 v18, v18, v19, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_alignbit_b32 v19, v19, v20, 16
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:52
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:48
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
-; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:56
-; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:48
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_alignbit_b32 v19, v19, v21, 16
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:44
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:52
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_alignbit_b32 v20, v20, v21, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v21, 16, v22
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:64
+; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_alignbit_b32 v21, v21, v23, 16
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:68
-; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:64
-; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:76
-; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:72
-; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:60
+; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:72
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:68
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_alignbit_b32 v22, v22, v23, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v23, 16, v24
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:80
+; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_alignbit_b32 v23, v23, v25, 16
-; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:84
-; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:80
-; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:92
-; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:88
-; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:76
+; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:88
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:84
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_alignbit_b32 v24, v24, v25, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v25, 16, v26
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:96
+; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_alignbit_b32 v25, v25, v27, 16
-; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:100
-; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:96
-; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:108
-; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:104
-; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:92
+; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:104
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:100
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_alignbit_b32 v26, v26, v27, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v27, 16, v28
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:112
+; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_alignbit_b32 v27, v27, v29, 16
-; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:116
-; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:112
-; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:124
+; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:108
 ; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_alignbit_b32 v28, v28, v29, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshrrev_b32_e32 v29, 16, v30
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_alignbit_b32 v29, v29, v31, 16
-; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:4
+; GCN-NEXT:    v_lshrrev_b32_e32 v29, 16, v31
 ; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:132
-; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_alignbit_b32 v29, v29, v32, 16
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_alignbit_b32 v30, v30, v31, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT:    v_alignbit_b32 v30, v31, v30, 16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v31, 16, v32
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_alignbit_b32 v31, v31, v33, 16
-; GCN-NEXT:    v_cndmask_b32_e32 v31, v31, v30, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v29, v29, v15, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v28, v28, v14, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v27, v27, v13, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v26, v26, v12, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v25, v25, v11, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v24, v24, v10, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v23, v23, v9, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v15, v22, v8, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v13, v21, v7, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v11, v20, v6, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v9, v19, v5, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v7, v18, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v17, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v16, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v31, v31, v30, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v29, v29, v14, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v28, v28, v13, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v27, v27, v12, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v26, v26, v11, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v25, v25, v10, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v24, v24, v9, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v23, v23, v8, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v22, v22, v7, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v13, v21, v6, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v11, v20, v5, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v9, v19, v4, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v7, v18, v3, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v17, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v16, v1, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v15, v0, s[4:5]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -26103,8 +25912,8 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
 ; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v22
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v22
 ; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
 ; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v23
 ; GCN-NEXT:    v_lshlrev_b32_e32 v18, 16, v24
@@ -26126,176 +25935,173 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX7-LABEL: v_select_v32bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; GFX7-NEXT:    v_alignbit_b32 v2, v2, v3, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v8
-; GFX7-NEXT:    v_alignbit_b32 v3, v3, v5, 16
-; GFX7-NEXT:    v_alignbit_b32 v4, v4, v7, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v12
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v14
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v16
-; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:12
-; GFX7-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:16
-; GFX7-NEXT:    v_alignbit_b32 v6, v6, v11, 16
-; GFX7-NEXT:    v_alignbit_b32 v7, v7, v13, 16
-; GFX7-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:24
-; GFX7-NEXT:    v_alignbit_b32 v8, v8, v15, 16
-; GFX7-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:40
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GFX7-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:8
-; GFX7-NEXT:    v_alignbit_b32 v5, v5, v9, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
-; GFX7-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
-; GFX7-NEXT:    v_alignbit_b32 v9, v9, v17, 16
-; GFX7-NEXT:    v_alignbit_b32 v25, v26, v25, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
-; GFX7-NEXT:    v_alignbit_b32 v21, v22, v21, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
-; GFX7-NEXT:    v_alignbit_b32 v29, v30, v29, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
-; GFX7-NEXT:    v_alignbit_b32 v19, v20, v19, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
-; GFX7-NEXT:    v_alignbit_b32 v23, v24, v23, 16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
-; GFX7-NEXT:    v_alignbit_b32 v27, v28, v27, 16
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:32
-; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
-; GFX7-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:108
-; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:48
-; GFX7-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:92
-; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:64
-; GFX7-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:124
-; GFX7-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:84
-; GFX7-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:100
-; GFX7-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:116
-; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v2, v2, v4, 16
+; GFX7-NEXT:    v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v11
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v13
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
+; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:12
+; GFX7-NEXT:    v_alignbit_b32 v5, v5, v10, 16
+; GFX7-NEXT:    v_alignbit_b32 v6, v6, v12, 16
+; GFX7-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
+; GFX7-NEXT:    v_alignbit_b32 v7, v7, v14, 16
+; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    v_alignbit_b32 v4, v4, v8, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v17
+; GFX7-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT:    v_alignbit_b32 v8, v8, v16, 16
+; GFX7-NEXT:    v_alignbit_b32 v24, v25, v24, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT:    v_alignbit_b32 v20, v21, v20, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT:    v_alignbit_b32 v28, v29, v28, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT:    v_alignbit_b32 v18, v19, v18, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT:    v_alignbit_b32 v22, v23, v22, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT:    v_alignbit_b32 v26, v27, v26, 16
+; GFX7-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; GFX7-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:104
+; GFX7-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:44
+; GFX7-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:88
+; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:60
+; GFX7-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:120
+; GFX7-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:80
+; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:96
+; GFX7-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:112
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
 ; GFX7-NEXT:    s_waitcnt vmcnt(14)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX7-NEXT:    s_waitcnt vmcnt(12)
-; GFX7-NEXT:    v_alignbit_b32 v10, v10, v11, 16
-; GFX7-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:20
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    s_waitcnt vmcnt(10)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
-; GFX7-NEXT:    s_waitcnt vmcnt(8)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
-; GFX7-NEXT:    s_waitcnt vmcnt(6)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT:    s_waitcnt vmcnt(11)
+; GFX7-NEXT:    v_alignbit_b32 v9, v9, v10, 16
+; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:16
+; GFX7-NEXT:    s_waitcnt vmcnt(9)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
 ; GFX7-NEXT:    s_waitcnt vmcnt(5)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX7-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
 ; GFX7-NEXT:    s_waitcnt vmcnt(4)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX7-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
 ; GFX7-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
+; GFX7-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT:    v_alignbit_b32 v10, v10, v11, 16
+; GFX7-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:24
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
 ; GFX7-NEXT:    v_alignbit_b32 v11, v11, v12, 16
-; GFX7-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:32
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
 ; GFX7-NEXT:    v_alignbit_b32 v12, v12, v13, 16
-; GFX7-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
 ; GFX7-NEXT:    v_alignbit_b32 v13, v13, v14, 16
-; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
+; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:48
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v13, v4, s[4:5]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
 ; GFX7-NEXT:    v_alignbit_b32 v14, v14, v15, 16
-; GFX7-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:52
+; GFX7-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:56
+; GFX7-NEXT:    v_cndmask_b32_e64 v14, v14, v5, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v11, v2, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
 ; GFX7-NEXT:    v_alignbit_b32 v15, v15, v16, 16
-; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60
-; GFX7-NEXT:    v_cndmask_b32_e32 v15, v15, v6, vcc
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
+; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:64
+; GFX7-NEXT:    v_cndmask_b32_e64 v15, v15, v6, s[4:5]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
 ; GFX7-NEXT:    v_alignbit_b32 v16, v16, v17, 16
-; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:68
-; GFX7-NEXT:    v_cndmask_b32_e32 v16, v16, v7, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v7, v13, v4, vcc
+; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:72
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, v16, v7, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v12, v3, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v10, v1, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v9, v0, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v16
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v16
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
-; GFX7-NEXT:    v_alignbit_b32 v17, v17, v18, 16
-; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:76
-; GFX7-NEXT:    v_cndmask_b32_e32 v17, v17, v8, vcc
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
-; GFX7-NEXT:    v_alignbit_b32 v18, v18, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GFX7-NEXT:    v_cndmask_b32_e32 v18, v18, v9, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v9, v14, v5, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v12, v3, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v11, v2, vcc
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v16
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v17
-; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v17
-; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
-; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_alignbit_b32 v20, v20, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GFX7-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
+; GFX7-NEXT:    v_alignbit_b32 v17, v17, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
+; GFX7-NEXT:    v_cndmask_b32_e64 v17, v17, v8, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v15
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v16
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
+; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_alignbit_b32 v19, v19, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX7-NEXT:    v_cndmask_b32_e64 v19, v19, v18, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
 ; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_alignbit_b32 v22, v22, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GFX7-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
+; GFX7-NEXT:    v_alignbit_b32 v21, v21, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
+; GFX7-NEXT:    v_cndmask_b32_e64 v21, v21, v20, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
 ; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_alignbit_b32 v24, v24, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GFX7-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
+; GFX7-NEXT:    v_alignbit_b32 v23, v23, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GFX7-NEXT:    v_cndmask_b32_e64 v23, v23, v22, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
 ; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_alignbit_b32 v26, v26, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GFX7-NEXT:    v_cndmask_b32_e32 v25, v26, v25, vcc
+; GFX7-NEXT:    v_alignbit_b32 v25, v25, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
+; GFX7-NEXT:    v_cndmask_b32_e64 v25, v25, v24, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
 ; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_alignbit_b32 v28, v28, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GFX7-NEXT:    v_cndmask_b32_e32 v27, v28, v27, vcc
+; GFX7-NEXT:    v_alignbit_b32 v27, v27, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX7-NEXT:    v_cndmask_b32_e64 v27, v27, v26, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
 ; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_alignbit_b32 v30, v30, v31, 16
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX7-NEXT:    v_cndmask_b32_e32 v29, v30, v29, vcc
+; GFX7-NEXT:    v_alignbit_b32 v29, v29, v31, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT:    v_cndmask_b32_e64 v29, v29, v28, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
 ; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
-; GFX7-NEXT:    v_alignbit_b32 v31, v31, v32, 16
-; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:132
+; GFX7-NEXT:    v_alignbit_b32 v30, v31, v30, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
-; GFX7-NEXT:    v_alignbit_b32 v32, v32, v33, 16
-; GFX7-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT:    v_alignbit_b32 v31, v31, v32, 16
+; GFX7-NEXT:    v_cndmask_b32_e64 v31, v31, v30, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
 ; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -26303,103 +26109,93 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX8-LABEL: v_select_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v18, v2, vcc
-; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32
-; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:4
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v19, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v20, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v21, v5, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v22, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v23, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v25, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v26, v10, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v27, v11, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v28, v12, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v29, v13, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v30, v14, vcc
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v15, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[4:5]
+; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v24, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v25, v9, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v26, v10, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v27, v11, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v28, v12, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v29, v13, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v30, v14, s[4:5]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v18, v16, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v16, v15, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v17, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v18, v2, vcc
-; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v19, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v20, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v21, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v22, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v23, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v24, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v25, v9, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v26, v10, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v27, v11, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v28, v12, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v29, v13, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v30, v14, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v15, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[4:5]
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v24, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v25, v9, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v26, v10, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v27, v11, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v28, v12, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v29, v13, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v30, v14, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v18, v16, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v16, v15, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v17, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v18, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v19, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v20, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v21, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v22, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v23, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v24, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v25, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v26, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v27, v11, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v28, v12, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v29, v13, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v30, v14, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v31, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v24, v8, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v25, v9, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v26, v10, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v27, v11, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v28, v12, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v29, v13, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v30, v14, s4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v32, v16, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v31, v15, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v17, v1 :: v_dual_cndmask_b32 v1, v18, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v19, v3 :: v_dual_cndmask_b32 v3, v20, v4
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v21, v5 :: v_dual_cndmask_b32 v5, v22, v6
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v23, v7 :: v_dual_cndmask_b32 v7, v24, v8
-; GFX11-NEXT:    v_dual_cndmask_b32 v8, v25, v9 :: v_dual_cndmask_b32 v9, v26, v10
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v27, v11 :: v_dual_cndmask_b32 v11, v28, v12
-; GFX11-NEXT:    v_dual_cndmask_b32 v12, v29, v13 :: v_dual_cndmask_b32 v13, v30, v14
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v24, v8, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, v25, v9, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v26, v10, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, v27, v11, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, v28, v12, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, v29, v13, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, v30, v14, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_cndmask_b32 v14, v31, v15 :: v_dual_cndmask_b32 v15, v32, v16
+; GFX11-NEXT:    v_cndmask_b32_e64 v15, v31, v15, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b
   ret <32 x bfloat> %op
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
index 42e9dce374776..df1e4b4c58916 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
@@ -45,13 +45,13 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_i8_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_i8_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i8_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i8_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -107,12 +107,12 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_i16_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_i16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i16_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -168,12 +168,12 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -228,13 +228,13 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_i64_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_i64_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i64_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i64_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -289,13 +289,13 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v2i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v2i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2i32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -350,14 +350,14 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s17, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s2, s16
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s18, s16
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[18:19]
-; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v3i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v3i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    s_getpc_b64 s[20:21]
+; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v3i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v3i32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -412,15 +412,15 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s3, s17
-; GFX9-NEXT:    s_mov_b32 s2, s16
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s19, s17
+; GFX9-NEXT:    s_mov_b32 s18, s16
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[18:19]
-; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v4i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v4i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    s_getpc_b64 s[20:21]
+; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v4i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v4i32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -475,19 +475,19 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s22, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s3, s17
-; GFX9-NEXT:    s_mov_b32 s2, s16
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
-; GFX9-NEXT:    s_mov_b32 s16, s18
-; GFX9-NEXT:    s_mov_b32 s17, s19
-; GFX9-NEXT:    s_mov_b32 s18, s20
-; GFX9-NEXT:    s_mov_b32 s19, s21
+; GFX9-NEXT:    s_mov_b32 s23, s21
+; GFX9-NEXT:    s_mov_b32 s22, s20
+; GFX9-NEXT:    s_mov_b32 s21, s19
+; GFX9-NEXT:    s_mov_b32 s20, s18
+; GFX9-NEXT:    s_mov_b32 s19, s17
+; GFX9-NEXT:    s_mov_b32 s18, s16
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[22:23]
-; GFX9-NEXT:    s_add_u32 s22, s22, external_void_func_v8i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s23, s23, external_void_func_v8i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[22:23]
+; GFX9-NEXT:    s_getpc_b64 s[24:25]
+; GFX9-NEXT:    s_add_u32 s24, s24, external_void_func_v8i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s25, s25, external_void_func_v8i32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[24:25]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -553,12 +553,12 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_f16_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_f16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_f16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f16_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -614,12 +614,12 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_bf16_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_bf16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_bf16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_bf16_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -675,12 +675,12 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_f32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_f32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_f32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -735,13 +735,13 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_f64_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_f64_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_f64_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f64_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -797,12 +797,12 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v2f16_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v2f16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2f16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2f16_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -859,12 +859,12 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v2bf16_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v2bf16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2bf16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2bf16_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -919,13 +919,13 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v3f16_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v3f16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v3f16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v3f16_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -980,13 +980,13 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v4f16_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v4f16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v4f16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v4f16_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -1041,13 +1041,13 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_p0_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_p0_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_p0_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p0_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -1102,13 +1102,13 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_p1_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_p1_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_p1_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p1_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -1164,12 +1164,12 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_p3_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_p3_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_p3_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p3_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -1224,15 +1224,15 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
 ; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s3, s17
-; GFX9-NEXT:    s_mov_b32 s2, s16
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s19, s17
+; GFX9-NEXT:    s_mov_b32 s18, s16
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[18:19]
-; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2p1_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2p1_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    s_getpc_b64 s[20:21]
+; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v2p1_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v2p1_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -1287,13 +1287,13 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[16:17]
-; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v2p5_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v2p5_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2p5_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2p5_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -1348,16 +1348,16 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
 ; GFX9-NEXT:    v_writelane_b32 v40, s19, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s3, s17
-; GFX9-NEXT:    s_mov_b32 s2, s16
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
-; GFX9-NEXT:    s_mov_b32 s16, s18
+; GFX9-NEXT:    s_mov_b32 s20, s18
+; GFX9-NEXT:    s_mov_b32 s19, s17
+; GFX9-NEXT:    s_mov_b32 s18, s16
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[20:21]
-; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_i64_inreg_i32_inreg_i64_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_i64_inreg_i32_inreg_i64_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
+; GFX9-NEXT:    s_getpc_b64 s[22:23]
+; GFX9-NEXT:    s_add_u32 s22, s22, external_void_func_i64_inreg_i32_inreg_i64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[22:23]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
@@ -1412,23 +1412,24 @@ define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, vcc
 ; GFX9-NEXT:    v_writelane_b32 v40, s29, 2
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s3, s17
-; GFX9-NEXT:    s_mov_b32 s2, s16
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s6
-; GFX9-NEXT:    s_mov_b32 s16, s18
-; GFX9-NEXT:    s_mov_b32 s17, s19
-; GFX9-NEXT:    s_mov_b32 s18, s20
-; GFX9-NEXT:    s_mov_b32 s19, s21
-; GFX9-NEXT:    s_mov_b32 s20, s22
-; GFX9-NEXT:    s_mov_b32 s21, s23
-; GFX9-NEXT:    s_mov_b32 s22, s24
-; GFX9-NEXT:    s_mov_b32 s23, s25
-; GFX9-NEXT:    s_mov_b32 s24, s26
-; GFX9-NEXT:    s_mov_b32 s25, s27
-; GFX9-NEXT:    s_mov_b32 s26, s28
+; GFX9-NEXT:    s_mov_b32 s30, s28
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s29, s27
+; GFX9-NEXT:    s_mov_b32 s28, s26
+; GFX9-NEXT:    s_mov_b32 s27, s25
+; GFX9-NEXT:    s_mov_b32 s26, s24
+; GFX9-NEXT:    s_mov_b32 s25, s23
+; GFX9-NEXT:    s_mov_b32 s24, s22
+; GFX9-NEXT:    s_mov_b32 s23, s21
+; GFX9-NEXT:    s_mov_b32 s22, s20
+; GFX9-NEXT:    s_mov_b32 s21, s19
+; GFX9-NEXT:    s_mov_b32 s20, s18
+; GFX9-NEXT:    s_mov_b32 s19, s17
+; GFX9-NEXT:    s_mov_b32 s18, s16
+; GFX9-NEXT:    s_mov_b32 s17, s7
+; GFX9-NEXT:    s_mov_b32 s16, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s30
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 vcc
 ; GFX9-NEXT:    s_add_u32 vcc_lo, vcc_lo, external_void_func_a15i32_inreg at rel32@lo+4
@@ -1513,22 +1514,6 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inre
 ; GFX9-NEXT:    v_writelane_b32 v40, s23, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s3, s7
-; GFX9-NEXT:    s_mov_b32 s2, s6
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s4, s8
-; GFX9-NEXT:    s_mov_b32 s5, s9
-; GFX9-NEXT:    s_mov_b32 s6, s10
-; GFX9-NEXT:    s_mov_b32 s7, s11
-; GFX9-NEXT:    s_mov_b32 s8, s15
-; GFX9-NEXT:    s_mov_b32 s9, s16
-; GFX9-NEXT:    s_mov_b32 s10, s17
-; GFX9-NEXT:    s_mov_b32 s11, s18
-; GFX9-NEXT:    s_mov_b32 s15, s19
-; GFX9-NEXT:    s_mov_b32 s16, s20
-; GFX9-NEXT:    s_mov_b32 s17, s21
-; GFX9-NEXT:    s_mov_b32 s18, s22
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[24:25]
 ; GFX9-NEXT:    s_add_u32 s24, s24, external_void_func_a15i32_inreg_i32_inreg__noimplicit at rel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 87e17a1c82080..10d39366ca56d 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -71,12 +71,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT:    v_mov_b32_e32 v0, 1
+; VI-NEXT:    s_mov_b64 s[4:5], -1
 ; VI-NEXT:    s_mov_b32 s32, 0
-; VI-NEXT:    s_getpc_b64 s[4:5]
-; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i1 at rel32@lo+4
-; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1 at rel32@hi+12
-; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_getpc_b64 s[6:7]
+; VI-NEXT:    s_add_u32 s6, s6, external_void_func_i1 at rel32@lo+4
+; VI-NEXT:    s_addc_u32 s7, s7, external_void_func_i1 at rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: test_call_external_void_func_i1_imm:
@@ -89,12 +89,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT:    v_mov_b32_e32 v0, 1
+; CI-NEXT:    s_mov_b64 s[4:5], -1
 ; CI-NEXT:    s_mov_b32 s32, 0
-; CI-NEXT:    s_getpc_b64 s[4:5]
-; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i1 at rel32@lo+4
-; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1 at rel32@hi+12
-; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_getpc_b64 s[6:7]
+; CI-NEXT:    s_add_u32 s6, s6, external_void_func_i1 at rel32@lo+4
+; CI-NEXT:    s_addc_u32 s7, s7, external_void_func_i1 at rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_call_external_void_func_i1_imm:
@@ -107,23 +107,23 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i1 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i1 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_call_external_void_func_i1_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_mov_b32_e32 v0, 1
+; GFX11-NEXT:    s_mov_b32 s0, -1
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i1 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i1 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_i1_imm:
@@ -131,14 +131,14 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; HSA-NEXT:    s_add_u32 s0, s0, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    v_mov_b32_e32 v0, 1
+; HSA-NEXT:    s_mov_b64 s[4:5], -1
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
-; HSA-NEXT:    s_getpc_b64 s[4:5]
-; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i1 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i1 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; HSA-NEXT:    s_getpc_b64 s[6:7]
+; HSA-NEXT:    s_add_u32 s6, s6, external_void_func_i1 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s7, s7, external_void_func_i1 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_i1(i1 true)
   ret void
@@ -160,11 +160,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
-; VI-NEXT:    s_getpc_b64 s[4:5]
-; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext at rel32@lo+4
-; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext at rel32@hi+12
-; VI-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_getpc_b64 s[6:7]
+; VI-NEXT:    s_add_u32 s6, s6, external_void_func_i1_signext at rel32@lo+4
+; VI-NEXT:    s_addc_u32 s7, s7, external_void_func_i1_signext at rel32@hi+12
+; VI-NEXT:    v_and_b32_e32 v0, 1, v0
+; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; VI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: test_call_external_void_func_i1_signext:
@@ -182,11 +183,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
-; CI-NEXT:    s_getpc_b64 s[4:5]
-; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext at rel32@lo+4
-; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext at rel32@hi+12
-; CI-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_getpc_b64 s[6:7]
+; CI-NEXT:    s_add_u32 s6, s6, external_void_func_i1_signext at rel32@lo+4
+; CI-NEXT:    s_addc_u32 s7, s7, external_void_func_i1_signext at rel32@hi+12
+; CI-NEXT:    v_and_b32_e32 v0, 1, v0
+; CI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; CI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_call_external_void_func_i1_signext:
@@ -204,11 +206,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext at rel32@hi+12
-; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i1_signext at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i1_signext at rel32@hi+12
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_call_external_void_func_i1_signext:
@@ -218,11 +221,13 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i1_signext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i1_signext at rel32@hi+12
-; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_signext at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_signext at rel32@hi+12
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_i1_signext:
@@ -237,12 +242,14 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[4:5]
-; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext at rel32@hi+12
-; HSA-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; HSA-NEXT:    s_getpc_b64 s[6:7]
+; HSA-NEXT:    s_add_u32 s6, s6, external_void_func_i1_signext at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s7, s7, external_void_func_i1_signext at rel32@hi+12
+; HSA-NEXT:    v_and_b32_e32 v0, 1, v0
+; HSA-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; HSA-NEXT:    s_endpgm
+
   %var = load volatile i1, ptr addrspace(1) undef
   call void @external_void_func_i1_signext(i1 signext %var)
   ret void
@@ -265,11 +272,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
-; VI-NEXT:    s_getpc_b64 s[4:5]
-; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext at rel32@lo+4
-; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext at rel32@hi+12
+; VI-NEXT:    s_getpc_b64 s[6:7]
+; VI-NEXT:    s_add_u32 s6, s6, external_void_func_i1_zeroext at rel32@lo+4
+; VI-NEXT:    s_addc_u32 s7, s7, external_void_func_i1_zeroext at rel32@hi+12
 ; VI-NEXT:    v_and_b32_e32 v0, 1, v0
-; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; VI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: test_call_external_void_func_i1_zeroext:
@@ -287,11 +295,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
-; CI-NEXT:    s_getpc_b64 s[4:5]
-; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext at rel32@lo+4
-; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext at rel32@hi+12
+; CI-NEXT:    s_getpc_b64 s[6:7]
+; CI-NEXT:    s_add_u32 s6, s6, external_void_func_i1_zeroext at rel32@lo+4
+; CI-NEXT:    s_addc_u32 s7, s7, external_void_func_i1_zeroext at rel32@hi+12
 ; CI-NEXT:    v_and_b32_e32 v0, 1, v0
-; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; CI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_call_external_void_func_i1_zeroext:
@@ -309,11 +318,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext at rel32@hi+12
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i1_zeroext at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i1_zeroext at rel32@hi+12
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_call_external_void_func_i1_zeroext:
@@ -323,11 +333,13 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i1_zeroext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i1_zeroext at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_zeroext at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_zeroext at rel32@hi+12
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_i1_zeroext:
@@ -342,11 +354,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[4:5]
-; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext at rel32@hi+12
+; HSA-NEXT:    s_getpc_b64 s[6:7]
+; HSA-NEXT:    s_add_u32 s6, s6, external_void_func_i1_zeroext at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s7, s7, external_void_func_i1_zeroext at rel32@hi+12
 ; HSA-NEXT:    v_and_b32_e32 v0, 1, v0
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; HSA-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; HSA-NEXT:    s_endpgm
   %var = load volatile i1, ptr addrspace(1) undef
   call void @external_void_func_i1_zeroext(i1 zeroext %var)
diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
index 10d71a315fbf9..66a04ed26ddb7 100644
--- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
@@ -11,8 +11,7 @@ define i1 @test1(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, 1000
   %cmp2 = icmp slt i32 %arg2, 1000
@@ -25,8 +24,7 @@ define i1 @test2(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, 1000
   %cmp2 = icmp ult i32 %arg2, 1000
@@ -39,8 +37,7 @@ define i1 @test3(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0x3e9, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, 0x3e9, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sle i32 %arg1, 1000
   %cmp2 = icmp sle i32 %arg2, 1000
@@ -53,8 +50,7 @@ define i1 @test4(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x3e9, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u32_e64 s0, 0x3e9, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ule i32 %arg1, 1000
   %cmp2 = icmp ule i32 %arg2, 1000
@@ -67,8 +63,7 @@ define i1 @test5(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, 1000
   %cmp2 = icmp sgt i32 %arg2, 1000
@@ -81,8 +76,7 @@ define i1 @test6(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg1, 1000
   %cmp2 = icmp ugt i32 %arg2, 1000
@@ -95,8 +89,7 @@ define i1 @test7(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0x3e7, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, 0x3e7, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sge i32 %arg1, 1000
   %cmp2 = icmp sge i32 %arg2, 1000
@@ -109,8 +102,7 @@ define i1 @test8(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3e7, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, 0x3e7, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp uge i32 %arg1, 1000
   %cmp2 = icmp uge i32 %arg2, 1000
@@ -123,8 +115,7 @@ define i1 @test9(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, %arg3
   %cmp2 = icmp slt i32 %arg2, %arg3
@@ -137,8 +128,7 @@ define i1 @test10(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %arg3
   %cmp2 = icmp ult i32 %arg2, %arg3
@@ -151,8 +141,7 @@ define i1 @test11(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_le_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sle i32 %arg1, %arg3
   %cmp2 = icmp sle i32 %arg2, %arg3
@@ -165,8 +154,7 @@ define i1 @test12(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ule i32 %arg1, %arg3
   %cmp2 = icmp ule i32 %arg2, %arg3
@@ -179,8 +167,7 @@ define i1 @test13(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, %arg3
   %cmp2 = icmp sgt i32 %arg2, %arg3
@@ -193,8 +180,7 @@ define i1 @test14(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg1, %arg3
   %cmp2 = icmp ugt i32 %arg2, %arg3
@@ -207,8 +193,7 @@ define i1 @test15(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_ge_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_ge_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sge i32 %arg1, %arg3
   %cmp2 = icmp sge i32 %arg2, %arg3
@@ -221,8 +206,7 @@ define i1 @test16(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_ge_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp uge i32 %arg1, %arg3
   %cmp2 = icmp uge i32 %arg2, %arg3
@@ -235,8 +219,7 @@ define i1 @test17(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, 1000
   %cmp2 = icmp slt i32 %arg2, 1000
@@ -249,8 +232,7 @@ define i1 @test18(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, 1000
   %cmp2 = icmp ult i32 %arg2, 1000
@@ -263,8 +245,7 @@ define i1 @test19(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0x3e9, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, 0x3e9, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sle i32 %arg1, 1000
   %cmp2 = icmp sle i32 %arg2, 1000
@@ -277,8 +258,7 @@ define i1 @test20(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x3e9, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u32_e64 s0, 0x3e9, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ule i32 %arg1, 1000
   %cmp2 = icmp ule i32 %arg2, 1000
@@ -291,8 +271,7 @@ define i1 @test21(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, 1000
   %cmp2 = icmp sgt i32 %arg2, 1000
@@ -305,8 +284,7 @@ define i1 @test22(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg1, 1000
   %cmp2 = icmp ugt i32 %arg2, 1000
@@ -319,8 +297,7 @@ define i1 @test23(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0x3e7, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, 0x3e7, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sge i32 %arg1, 1000
   %cmp2 = icmp sge i32 %arg2, 1000
@@ -333,8 +310,7 @@ define i1 @test24(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3e7, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, 0x3e7, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp uge i32 %arg1, 1000
   %cmp2 = icmp uge i32 %arg2, 1000
@@ -347,8 +323,7 @@ define i1 @test25(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, %arg3
   %cmp2 = icmp slt i32 %arg2, %arg3
@@ -361,8 +336,7 @@ define i1 @test26(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %arg3
   %cmp2 = icmp ult i32 %arg2, %arg3
@@ -375,8 +349,7 @@ define i1 @test27(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_le_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sle i32 %arg1, %arg3
   %cmp2 = icmp sle i32 %arg2, %arg3
@@ -389,8 +362,7 @@ define i1 @test28(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ule i32 %arg1, %arg3
   %cmp2 = icmp ule i32 %arg2, %arg3
@@ -403,8 +375,7 @@ define i1 @test29(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, %arg3
   %cmp2 = icmp sgt i32 %arg2, %arg3
@@ -417,8 +388,7 @@ define i1 @test30(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg1, %arg3
   %cmp2 = icmp ugt i32 %arg2, %arg3
@@ -431,8 +401,7 @@ define i1 @test31(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_ge_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_ge_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sge i32 %arg1, %arg3
   %cmp2 = icmp sge i32 %arg2, %arg3
@@ -445,8 +414,7 @@ define i1 @test32(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_ge_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp uge i32 %arg1, %arg3
   %cmp2 = icmp uge i32 %arg2, %arg3
@@ -459,8 +427,7 @@ define i1 @test33(i32 %arg1, i32 %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v1, 0x3e8, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, v1, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, %arg2
   %cmp2 = icmp slt i32 %arg1, 1000
@@ -633,8 +600,7 @@ define i1 @test42(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg3, %arg1
   %cmp2 = icmp ult i32 %arg3, %arg2
@@ -647,8 +613,7 @@ define i1 @test43(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg3, %arg1
   %cmp2 = icmp ult i32 %arg3, %arg2
@@ -661,8 +626,7 @@ define i1 @test44(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg3, %arg1
   %cmp2 = icmp ugt i32 %arg3, %arg2
@@ -675,8 +639,7 @@ define i1 @test45(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg3, %arg1
   %cmp2 = icmp ugt i32 %arg3, %arg2
@@ -689,8 +652,7 @@ define i1 @test46(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg3, %arg1
   %cmp2 = icmp sgt i32 %arg2, %arg3
@@ -703,8 +665,7 @@ define i1 @test47(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, %arg3
   %cmp2 = icmp slt i32 %arg3, %arg2
@@ -717,8 +678,7 @@ define i1 @test48(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, %arg3
   %cmp2 = icmp sgt i32 %arg3, %arg2
@@ -731,8 +691,7 @@ define i1 @test49(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg3, %arg1
   %cmp2 = icmp slt i32 %arg2, %arg3
@@ -745,8 +704,7 @@ define i1 @test50(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg3, %arg1
   %cmp2 = icmp sgt i32 %arg2, %arg3
@@ -759,8 +717,7 @@ define i1 @test51(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, %arg3
   %cmp2 = icmp slt i32 %arg3, %arg2
@@ -773,8 +730,7 @@ define i1 @test52(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, %arg3
   %cmp2 = icmp sgt i32 %arg3, %arg2
@@ -787,8 +743,7 @@ define i1 @test53(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg3, %arg1
   %cmp2 = icmp slt i32 %arg2, %arg3
@@ -801,8 +756,7 @@ define i1 @test54(float %arg1, float %arg2, float %arg3) #0 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %arg3
   %cmp2 = fcmp olt float %arg2, %arg3
@@ -815,8 +769,7 @@ define i1 @test55(double %arg1, double %arg2, double %arg3) #0 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ole double %arg1, %arg3
   %cmp2 = fcmp ole double %arg2, %arg3
@@ -829,8 +782,7 @@ define i1 @test56(double %arg1, double %arg2, double %arg3) #0 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ogt double %arg1, %arg3
   %cmp2 = fcmp ogt double %arg2, %arg3
@@ -843,8 +795,7 @@ define i1 @test57(float %arg1, float %arg2, float %arg3) #0 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp oge float %arg1, %arg3
   %cmp2 = fcmp oge float %arg2, %arg3
@@ -857,16 +808,14 @@ define i1 @test58(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT:    v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nle_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test58:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ugt double %arg1, %arg3
   %cmp2 = fcmp ugt double %arg2, %arg3
@@ -879,16 +828,14 @@ define i1 @test59(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nlt_f32_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test59:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp uge float %arg1, %arg3
   %cmp2 = fcmp uge float %arg2, %arg3
@@ -901,16 +848,14 @@ define i1 @test60(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ngt_f32_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test60:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ule float %arg1, %arg3
   %cmp2 = fcmp ule float %arg2, %arg3
@@ -923,16 +868,14 @@ define i1 @test61(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT:    v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nge_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test61:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult double %arg1, %arg3
   %cmp2 = fcmp ult double %arg2, %arg3
@@ -946,8 +889,7 @@ define i1 @test62(float %arg1, float %arg2, float %arg3) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, 1.0
   %add2 = fadd nnan float %arg2, 2.0
@@ -964,8 +906,7 @@ define i1 @test63(double %arg1, double %arg2, double %arg3) #0 {
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 2.0
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan double %arg1, 1.0
   %add2 = fadd nnan double %arg2, 2.0
@@ -982,8 +923,7 @@ define i1 @test64(double %arg1, double %arg2, double %arg3) #0 {
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 2.0
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan double %arg1, 1.0
   %add2 = fadd nnan double %arg2, 2.0
@@ -999,8 +939,7 @@ define i1 @test65(float %arg1, float %arg2, float %arg3) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, 1.0
   %add2 = fadd nnan float %arg2, 2.0
@@ -1017,8 +956,7 @@ define i1 @test66(double %arg1, double %arg2, double %arg3) {
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 2.0
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan double %arg1, 1.0
   %add2 = fadd nnan double %arg2, 2.0
@@ -1034,8 +972,7 @@ define i1 @test67(float %arg1, float %arg2, float %arg3) #0 {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, 1.0
   %add2 = fadd nnan float %arg2, 2.0
@@ -1051,8 +988,7 @@ define i1 @test68(float %arg1, float %arg2, float %arg3) #0 {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_f32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, 1.0
   %add2 = fadd nnan float %arg2, 2.0
@@ -1069,8 +1005,7 @@ define i1 @test69(double %arg1, double %arg2, double %arg3) {
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 2.0
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan double %arg1, 1.0
   %add2 = fadd nnan double %arg2, 2.0
@@ -1086,16 +1021,14 @@ define i1 @test70(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test70:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1112,8 +1045,7 @@ define i1 @test71(double %arg1, double %arg2, double %arg3) {
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GCN-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1130,8 +1062,7 @@ define i1 @test72(double %arg1, double %arg2, double %arg3) {
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GCN-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1147,16 +1078,14 @@ define i1 @test73(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test73:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1173,8 +1102,7 @@ define i1 @test74(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT:    v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nle_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test74:
@@ -1183,8 +1111,7 @@ define i1 @test74(double %arg1, double %arg2, double %arg3) {
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1200,16 +1127,14 @@ define i1 @test75(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nlt_f32_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test75:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1225,16 +1150,14 @@ define i1 @test76(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ngt_f32_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test76:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1251,8 +1174,7 @@ define i1 @test77(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT:    v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nge_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test77:
@@ -1261,8 +1183,7 @@ define i1 @test77(double %arg1, double %arg2, double %arg3) {
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1277,8 +1198,7 @@ define i1 @test78(float %arg1, float %arg2, float %arg3) #0 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %arg3
   %cmp2 = fcmp ogt float %arg3, %arg2
@@ -1291,16 +1211,14 @@ define i1 @test79(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nge_f32_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test79:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult float %arg1, %arg3
   %cmp2 = fcmp ugt float %arg3, %arg2
@@ -1314,8 +1232,7 @@ define i1 @test80(float %arg1, float %arg2, float %arg3) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, 1.0
   %add2 = fadd nnan float %arg2, 2.0
@@ -1332,8 +1249,7 @@ define i1 @test81(double %arg1, double %arg2, double %arg3) {
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 2.0
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan double %arg1, 1.0
   %add2 = fadd nnan double %arg2, 2.0
@@ -1350,8 +1266,7 @@ define i1 @test82(double %arg1, double %arg2, double %arg3) {
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GCN-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1367,16 +1282,14 @@ define i1 @test83(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ngt_f32_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test83:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1393,16 +1306,14 @@ define i1 @test84(half %arg1, half %arg2, half %arg3) {
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f16_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test84:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f16_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call half @llvm.canonicalize.f16(half %arg1)
   %var2 = call half @llvm.canonicalize.f16(half %arg2)
@@ -1487,16 +1398,14 @@ define i1 @test87(half %arg1, half %arg2, half %arg3) {
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test87:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f16_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call half @llvm.canonicalize.f16(half %arg1)
   %var2 = call half @llvm.canonicalize.f16(half %arg2)
@@ -1547,16 +1456,14 @@ define i1 @test89(half %arg1, half %arg2, half %arg3) {
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nlt_f16_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test89:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f16_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call half @llvm.canonicalize.f16(half %arg1)
   %var2 = call half @llvm.canonicalize.f16(half %arg2)
@@ -1573,16 +1480,14 @@ define i1 @test90(half %arg1, half %arg2, half %arg3) {
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ngt_f16_e64 s0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test90:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f16_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call half @llvm.canonicalize.f16(half %arg1)
   %var2 = call half @llvm.canonicalize.f16(half %arg2)
@@ -1631,8 +1536,7 @@ define i1 @test92(i32 %arg1, i32 %arg2, i32 %arg3, i32 %C) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min3_u32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1651,7 +1555,6 @@ define i1 @test93(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
 ; GCN-NEXT:    v_cmp_gt_u32_e64 s0, v1, v4
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1671,8 +1574,7 @@ define i1 @test94(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %ar
 ; GCN-NEXT:    v_min3_u32 v0, v0, v1, v2
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v4
 ; GCN-NEXT:    v_min3_u32 v0, v5, v6, v0
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v8
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1697,8 +1599,7 @@ define i1 @test95(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_maxmin_u32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1713,8 +1614,7 @@ define i1 @test96(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_minmax_u32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1730,8 +1630,7 @@ define i1 @test97(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_max3_u32 v0, v0, v2, v3
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1749,8 +1648,7 @@ define i1 @test98(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v2, v2, v3
 ; GCN-NEXT:    v_minmax_u32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1768,8 +1666,7 @@ define i1 @test99(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v2, v2, v3
 ; GCN-NEXT:    v_min3_u32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1787,8 +1684,7 @@ define i1 @test100(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v2, v2, v3
 ; GCN-NEXT:    v_maxmin_u32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1807,8 +1703,7 @@ define i1 @test101(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_minmax_u32 v1, v3, v4, v5
 ; GCN-NEXT:    v_min3_u32 v0, v0, v2, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v6
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1831,8 +1726,7 @@ define i1 @test102(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_min_u32_e32 v1, v2, v3
 ; GCN-NEXT:    v_min3_u32 v0, v0, v5, v1
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v6
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1859,7 +1753,6 @@ define i1 @test103(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v2, v6
 ; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v6
 ; GCN-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1892,7 +1785,6 @@ define i1 @test104(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    s_or_b32 s1, s2, vcc_lo
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1931,7 +1823,6 @@ define i1 @test105(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 ; GCN-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GCN-NEXT:    s_or_b32 s1, s2, s1
 ; GCN-NEXT:    s_and_b32 s0, s0, s1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1968,7 +1859,6 @@ define i1 @test106(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    s_or_b32 s0, s2, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C1
   %cmp2 = icmp ult i32 %arg2, %C1
@@ -2001,8 +1891,7 @@ define i1 @test107(float %arg1, float %arg2, float %arg3, float %C) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f32_e64 s0, v0, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2017,16 +1906,14 @@ define i1 @test108(float %arg1, float %arg2, float %arg3, float %C) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max3_f32 v0, v0, v1, v2
-; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v3
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_nge_f32_e64 s0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test108:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max3_f32 v0, v0, v1, v2
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v3
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult float %arg1, %C
   %cmp2 = fcmp ult float %arg2, %C
@@ -2046,7 +1933,6 @@ define i1 @test109(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, v1, v4
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test109:
@@ -2056,7 +1942,6 @@ define i1 @test109(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
 ; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
 ; GFX11NONANS-NEXT:    v_cmp_gt_f32_e64 s0, v1, v4
 ; GFX11NONANS-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2078,7 +1963,6 @@ define i1 @test110(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
 ; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
 ; GCN-NEXT:    v_cmp_gt_f32_e64 s0, v1, v8
 ; GCN-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, %C1
   %add2 = fadd nnan float %arg2, %C2
@@ -2099,12 +1983,12 @@ define i1 @test111(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
-; GFX11-NEXT:    v_dual_min_f32 v2, v2, v3 :: v_dual_max_f32 v3, v4, v4
+; GFX11-NEXT:    v_min_f32_e32 v2, v2, v3
 ; GFX11-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GFX11-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX11-NEXT:    v_max_f32_e32 v1, v4, v4
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min3_f32 v0, v5, v6, v0
-; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, v0, v8
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test111:
@@ -2114,8 +1998,7 @@ define i1 @test111(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11NONANS-NEXT:    v_min3_f32 v0, v0, v1, v2
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v4
 ; GFX11NONANS-NEXT:    v_min3_f32 v0, v5, v6, v0
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v8
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2141,13 +2024,13 @@ define i1 @test112(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
 ; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v4, v8
-; GFX11-NEXT:    v_dual_max_f32 v5, v5, v5 :: v_dual_min_f32 v2, v2, v3
+; GFX11-NEXT:    v_min_f32_e32 v2, v2, v3
 ; GFX11-NEXT:    v_max_f32_e32 v3, v6, v6
 ; GFX11-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GFX11-NEXT:    v_min3_f32 v0, v0, v5, v3
+; GFX11-NEXT:    v_max_f32_e32 v1, v5, v5
+; GFX11-NEXT:    v_min3_f32 v0, v0, v1, v3
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, v0, v8
 ; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test112:
@@ -2157,8 +2040,7 @@ define i1 @test112(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11NONANS-NEXT:    v_min3_f32 v0, v0, v1, v2
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v4
 ; GFX11NONANS-NEXT:    v_min3_f32 v0, v5, v6, v0
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v8
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2187,15 +2069,13 @@ define i1 @test113(float %arg1, float %arg2, float %arg3, float %C) {
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_nge_f32_e64 s0, v0, v3
 ; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test113:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_maxmin_f32 v0, v0, v1, v2
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v3
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult float %arg1, %C
   %cmp2 = fcmp ult float %arg2, %C
@@ -2214,7 +2094,6 @@ define i1 @test114(float %arg1, float %arg2, float %arg3, float %C) {
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, v0, v3
 ; GFX11-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test114:
@@ -2224,7 +2103,6 @@ define i1 @test114(float %arg1, float %arg2, float %arg3, float %C) {
 ; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX11NONANS-NEXT:    v_cmp_gt_f32_e64 s0, v0, v3
 ; GFX11NONANS-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ogt float %arg1, %C
   %cmp2 = fcmp ogt float %arg2, %C
@@ -2244,7 +2122,6 @@ define i1 @test115(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
 ; GFX11-NEXT:    v_cmp_nge_f32_e64 s0, v1, v4
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test115:
@@ -2252,8 +2129,7 @@ define i1 @test115(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v2, v2, v3
 ; GFX11NONANS-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v4
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2287,7 +2163,6 @@ define i1 @test116(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-NEXT:    s_or_b32 s1, s2, vcc_lo
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test116:
@@ -2304,7 +2179,6 @@ define i1 @test116(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11NONANS-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11NONANS-NEXT:    s_or_b32 s1, s2, vcc_lo
 ; GFX11NONANS-NEXT:    s_or_b32 s0, s0, s1
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2348,7 +2222,6 @@ define i1 @test117(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-NEXT:    s_or_b32 s0, s2, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test117:
@@ -2366,7 +2239,6 @@ define i1 @test117(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11NONANS-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11NONANS-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11NONANS-NEXT:    s_or_b32 s0, s2, s0
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C1
   %cmp2 = fcmp olt float %arg2, %C1
@@ -2403,8 +2275,7 @@ define i1 @test118(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
 ; GCN-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_max3_f32 v0, v0, v2, v3
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f32_e64 s0, v0, v8
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, %C1
   %add2 = fadd nnan float %arg2, %C2
@@ -2428,8 +2299,7 @@ define i1 @test119(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
 ; GCN-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
 ; GCN-NEXT:    v_min_f32_e32 v2, v2, v3
 ; GCN-NEXT:    v_minmax_f32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f32_e64 s0, v0, v8
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, %C1
   %add2 = fadd nnan float %arg2, %C2
@@ -2453,8 +2323,7 @@ define i1 @test120(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
 ; GCN-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
 ; GCN-NEXT:    v_max_f32_e32 v2, v2, v3
 ; GCN-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f32_e64 s0, v0, v8
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, %C1
   %add2 = fadd nnan float %arg2, %C2
@@ -2478,8 +2347,7 @@ define i1 @test121(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
 ; GCN-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
 ; GCN-NEXT:    v_max_f32_e32 v2, v2, v3
 ; GCN-NEXT:    v_maxmin_f32 v0, v0, v1, v2
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f32_e64 s0, v0, v8
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, %C1
   %add2 = fadd nnan float %arg2, %C2
@@ -2500,8 +2368,7 @@ define i1 @test122(double %arg1, double %arg2, double %arg3) #1 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult double %arg1, %arg3
   %cmp2 = fcmp ult double %arg2, %arg3
@@ -2516,8 +2383,7 @@ define i1 @test123(double %arg1, double %arg2, double %arg3) #1 {
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GCN-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -2536,7 +2402,6 @@ define i1 @test124(i32 %arg1, i64 %arg2) {
 ; GCN-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0x3e8, v[1:2]
 ; GCN-NEXT:    v_cmp_gt_i32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, 1000
   %cmp2 = icmp slt i64 %arg2, 1000
@@ -2551,7 +2416,6 @@ define i1 @test125(i32 %arg1, i32 %arg2) {
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x3e8, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s0, 0x3e8, v1
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp eq i32 %arg1, 1000
   %cmp2 = icmp eq i32 %arg2, 1000
@@ -2566,7 +2430,6 @@ define i1 @test126(i32 %arg1, i32 %arg2) {
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0x3e8, v0
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s0, 0x3e8, v1
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ne i32 %arg1, 1000
   %cmp2 = icmp ne i32 %arg2, 1000
@@ -2581,7 +2444,6 @@ define i1 @test127(i64 %arg1, i64 %arg2, i64 %arg3) {
 ; GCN-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
 ; GCN-NEXT:    v_cmp_lt_u64_e64 s0, v[2:3], v[4:5]
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
    %cmp1 = icmp ult i64 %arg1, %arg3
    %cmp2 = icmp ult i64 %arg2, %arg3
@@ -2596,7 +2458,6 @@ define i1 @test128(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v2, v1
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %arg3
   %cmp2 = icmp ult i32 %arg3, %arg2
@@ -2611,7 +2472,6 @@ define i1 @test129(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cmp_le_u32_e64 s0, v1, v2
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %arg3
   %cmp2 = icmp ule i32 %arg2, %arg3
@@ -2626,7 +2486,6 @@ define i1 @test130(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc_lo, v2, v0
 ; GCN-NEXT:    v_cmp_gt_u32_e64 s0, v1, v2
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ule i32 %arg3, %arg1
   %cmp2 = icmp ugt i32 %arg2, %arg3
@@ -2641,7 +2500,6 @@ define i1 @test131(i16 %arg1, i32 %arg2) {
 ; GCN-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 10, v0
 ; GCN-NEXT:    v_cmp_gt_u32_e64 s0, 10, v1
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i16 %arg1, 10
   %cmp2 = icmp ult i32 %arg2, 10
@@ -2659,7 +2517,6 @@ define i1 @test132(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4) {
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GCN-NEXT:    s_or_b32 s1, s1, vcc_lo
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %arg3
   %cmp2 = icmp ult i32 %arg2, %arg3
@@ -2677,7 +2534,6 @@ define i1 @test133(i32 %arg1, i32 %arg2) {
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v0
 ; GCN-NEXT:    v_cmp_gt_u32_e64 s0, 0x3e8, v1
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, 100
   %cmp2 = icmp ult i32 %arg2, 1000
@@ -2692,15 +2548,13 @@ define i1 @test134(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, v2, v1
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test134:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %arg3
   %cmp2 = fcmp ogt float %arg3, %arg2
@@ -2715,15 +2569,13 @@ define i1 @test135(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_nle_f32_e64 s0, v2, v1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test135:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult float %arg1, %arg3
   %cmp2 = fcmp ugt float %arg3, %arg2
@@ -2740,7 +2592,6 @@ define i1 @test136(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_ge_f64_e64 s0, v[4:5], v[2:3]
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test136:
@@ -2749,8 +2600,7 @@ define i1 @test136(double %arg1, double %arg2, double %arg3) {
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -2768,15 +2618,13 @@ define i1 @test137(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_nlt_f32_e64 s0, v2, v1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test137:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -2793,15 +2641,13 @@ define i1 @test138(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test138:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %arg3
   %cmp2 = fcmp olt float %arg2, %arg3
@@ -2816,15 +2662,13 @@ define i1 @test139(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_le_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test139:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ole double %arg1, %arg3
   %cmp2 = fcmp ole double %arg2, %arg3
@@ -2839,15 +2683,13 @@ define i1 @test140(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_gt_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test140:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ogt double %arg1, %arg3
   %cmp2 = fcmp ogt double %arg2, %arg3
@@ -2862,15 +2704,13 @@ define i1 @test141(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test141:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp oge float %arg1, %arg3
   %cmp2 = fcmp oge float %arg2, %arg3
@@ -2885,15 +2725,13 @@ define i1 @test142(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_nle_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test142:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ugt double %arg1, %arg3
   %cmp2 = fcmp ugt double %arg2, %arg3
@@ -2908,15 +2746,13 @@ define i1 @test143(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_nlt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test143:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp uge float %arg1, %arg3
   %cmp2 = fcmp uge float %arg2, %arg3
@@ -2931,15 +2767,13 @@ define i1 @test144(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_ngt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test144:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ule float %arg1, %arg3
   %cmp2 = fcmp ule float %arg2, %arg3
@@ -2954,15 +2788,13 @@ define i1 @test145(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-NEXT:    v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_nge_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test145:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult double %arg1, %arg3
   %cmp2 = fcmp ult double %arg2, %arg3
@@ -2978,15 +2810,13 @@ define i1 @test146(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test146:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -3005,7 +2835,6 @@ define i1 @test147(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_le_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test147:
@@ -3014,8 +2843,7 @@ define i1 @test147(double %arg1, double %arg2, double %arg3) {
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -3034,7 +2862,6 @@ define i1 @test148(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_gt_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test148:
@@ -3043,8 +2870,7 @@ define i1 @test148(double %arg1, double %arg2, double %arg3) {
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -3062,15 +2888,13 @@ define i1 @test149(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test149:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -3089,7 +2913,6 @@ define i1 @test150(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_nle_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test150:
@@ -3098,8 +2921,7 @@ define i1 @test150(double %arg1, double %arg2, double %arg3) {
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -3117,15 +2939,13 @@ define i1 @test151(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_nlt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test151:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_ge_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -3143,15 +2963,13 @@ define i1 @test152(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_ngt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test152:
 ; GFX11NONANS:       ; %bb.0:
 ; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_le_f32_e64 s0, v0, v2
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -3170,7 +2988,6 @@ define i1 @test153(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_nge_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test153:
@@ -3179,8 +2996,7 @@ define i1 @test153(double %arg1, double %arg2, double %arg3) {
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll
index eecc91239c728..279819165f33c 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll
@@ -5,20 +5,19 @@ define void @wombat(i1 %cond, ptr addrspace(5) %addr) {
 ; CHECK-LABEL: wombat:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
-; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT:    buffer_load_ubyte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %then
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:  .LBB0_2: ; %end
-; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
+
 entry:
   %load = load <1 x i8>, ptr addrspace(5) %addr, align 1
   br i1 %cond, label %then, label %end
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index c3a6cd5975a77..53448df79ee27 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -34,19 +34,17 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x
 define i1 @divergent_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x, i1 %z) {
   ; GCN-LABEL: name: divergent_trunc_i16_to_i1
   ; GCN: bb.0 (%ir-block.0):
-  ; GCN-NEXT:   liveins: $vgpr2, $vgpr3
+  ; GCN-NEXT:   liveins: $vgpr2, $sgpr4_sgpr5
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GCN-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY]], implicit $exec
-  ; GCN-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec
   ; GCN-NEXT:   [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[COPY1]], 0, 16, implicit $exec
   ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GCN-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[V_BFE_I32_e64_]], killed [[S_MOV_B32_]], implicit $exec
-  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
-  ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
-  ; GCN-NEXT:   $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
-  ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], [[COPY]], implicit-def dead $scc
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vreg_1 = COPY [[S_OR_B64_]]
+  ; GCN-NEXT:   $sgpr0_sgpr1 = COPY [[COPY2]]
+  ; GCN-NEXT:   SI_RETURN implicit $sgpr0_sgpr1
   %setcc = icmp slt i16 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
   ret i1 %select
@@ -86,18 +84,16 @@ define amdgpu_kernel void @uniform_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x
 define i1 @divergent_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x, i1 %z) {
   ; GCN-LABEL: name: divergent_trunc_i32_to_i1
   ; GCN: bb.0 (%ir-block.0):
-  ; GCN-NEXT:   liveins: $vgpr2, $vgpr3
+  ; GCN-NEXT:   liveins: $vgpr2, $sgpr4_sgpr5
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GCN-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY]], implicit $exec
-  ; GCN-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec
   ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GCN-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY1]], killed [[S_MOV_B32_]], implicit $exec
-  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
-  ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
-  ; GCN-NEXT:   $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
-  ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], [[COPY]], implicit-def dead $scc
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vreg_1 = COPY [[S_OR_B64_]]
+  ; GCN-NEXT:   $sgpr0_sgpr1 = COPY [[COPY2]]
+  ; GCN-NEXT:   SI_RETURN implicit $sgpr0_sgpr1
   %setcc = icmp slt i32 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
   ret i1 %select
@@ -141,21 +137,19 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x
 define i1 @divergent_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x, i1 %z) {
   ; GCN-LABEL: name: divergent_trunc_i64_to_i1
   ; GCN: bb.0 (%ir-block.0):
-  ; GCN-NEXT:   liveins: $vgpr2, $vgpr3, $vgpr4
+  ; GCN-NEXT:   liveins: $vgpr2, $vgpr3, $sgpr4_sgpr5
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GCN-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GCN-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY]], implicit $exec
-  ; GCN-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec
   ; GCN-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
   ; GCN-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
   ; GCN-NEXT:   [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE]], [[COPY3]], implicit $exec
-  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
-  ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
-  ; GCN-NEXT:   $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
-  ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], [[COPY]], implicit-def dead $scc
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vreg_1 = COPY [[S_OR_B64_]]
+  ; GCN-NEXT:   $sgpr0_sgpr1 = COPY [[COPY2]]
+  ; GCN-NEXT:   SI_RETURN implicit $sgpr0_sgpr1
   %setcc = icmp slt i64 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
   ret i1 %select
diff --git a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
index 72ee660dc2adb..02a3066822e51 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
@@ -29,6 +29,8 @@ define i1 @extractloadi1(ptr %ptr, i32 %idx) {
 ; CHECK-NEXT:    buffer_store_byte v2, off, s[0:3], s32 offset:1
 ; CHECK-NEXT:    buffer_load_ubyte v0, v1, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %val = load <8 x i1>, ptr %ptr
   %ret = extractelement <8 x i1> %val, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 17f67615c29f2..40cbae2af9259 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -2837,10 +2837,8 @@ define float @v_fneg_select_infloop_regression_f32(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 0.0, float %arg
   %i2 = fneg float %i
@@ -2852,10 +2850,8 @@ define float @v_fneg_select_infloop_regression_f32_commute0(float %arg, i1 %arg1
 ; GCN-LABEL: v_fneg_select_infloop_regression_f32_commute0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float 0.0
   %i2 = fneg float %i
@@ -2867,10 +2863,8 @@ define float @v_fneg_select_infloop_regression_f32_commute1(float %arg, i1 %arg1
 ; GCN-LABEL: v_fneg_select_infloop_regression_f32_commute1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -v0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 0.0, float %arg
   %i2 = fneg float %i
@@ -2882,10 +2876,8 @@ define float @v_fneg_select_infloop_regression_f32_commute2(float %arg, i1 %arg1
 ; GCN-LABEL: v_fneg_select_infloop_regression_f32_commute2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -v0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float 0.0
   %i2 = fneg float %i
@@ -2898,10 +2890,8 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32(float %arg, i1 %ar
 ; GCN-LABEL: v_fneg_select_infloop_regression_inline_imm_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 2.0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 2.0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 2.0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 2.0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 2.0, float %arg
   %i2 = fneg float %i
@@ -2913,10 +2903,8 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32_commute0(float %ar
 ; GCN-LABEL: v_fneg_select_infloop_regression_inline_imm_f32_commute0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 2.0, v0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 2.0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 2.0, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 2.0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float 2.0
   %i2 = fneg float %i
@@ -2928,10 +2916,8 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32_commute1(float %ar
 ; GCN-LABEL: v_fneg_select_infloop_regression_inline_imm_f32_commute1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 2.0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 2.0, -v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 2.0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 2.0, -v0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 2.0, float %arg
   %i2 = fneg float %i
@@ -2943,10 +2929,8 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32_commute2(float %ar
 ; GCN-LABEL: v_fneg_select_infloop_regression_inline_imm_f32_commute2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 2.0, v0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 2.0, -v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 2.0, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 2.0, -v0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float 2.0
   %i2 = fneg float %i
@@ -2959,10 +2943,8 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32(float %arg, i1
 ; GCN-LABEL: v_fneg_select_infloop_regression_neg_inline_imm_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -2.0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, -2.0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -2.0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, -2.0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float -2.0, float %arg
   %i2 = fneg float %i
@@ -2974,10 +2956,8 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute0(float
 ; GCN-LABEL: v_fneg_select_infloop_regression_neg_inline_imm_f32_commute0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, -2.0, v0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, -2.0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -2.0, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, -2.0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float -2.0
   %i2 = fneg float %i
@@ -2989,10 +2969,8 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute1(float
 ; GCN-LABEL: v_fneg_select_infloop_regression_neg_inline_imm_f32_commute1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -2.0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -2.0, -v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -2.0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -2.0, -v0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float -2.0, float %arg
   %i2 = fneg float %i
@@ -3004,10 +2982,8 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2(float
 ; GCN-LABEL: v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, -2.0, v0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -2.0, -v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -2.0, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -2.0, -v0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float -2.0
   %i2 = fneg float %i
@@ -3066,12 +3042,10 @@ define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
-; GCN-NEXT:    v_bfrev_b32_e32 v3, 1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v1, -v1, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GCN-NEXT:    v_bfrev_b32_e32 v2, 1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v1, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, double 0.0, double %arg
   %i2 = fneg double %i
@@ -3123,21 +3097,17 @@ define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_and_b32_e32 v1, 1, v1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT:    v_cndmask_b32_e64 v0, -v0, 0, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v0, -v0, 0, s[4:5]
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_select_infloop_regression_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v1, 1, v1
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, half 0.0, half %arg
   %i2 = fneg half %i
@@ -3190,11 +3160,9 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_and_b32_e32 v1, 1, v2
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
 ; SI-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -3203,11 +3171,9 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a
 ; VI-LABEL: v_fneg_select_infloop_regression_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v1, 1, v1
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, <2 x half> zeroinitializer, <2 x half> %arg
   %i2 = fneg <2 x half> %i
@@ -3264,13 +3230,11 @@ define <2 x float> @v_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1
 ; GCN-LABEL: v_fneg_select_infloop_regression_v2f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
-; GCN-NEXT:    v_bfrev_b32_e32 v3, 1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v1, -v1, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GCN-NEXT:    v_bfrev_b32_e32 v2, 1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v1, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, <2 x float> zeroinitializer, <2 x float> %arg
   %i2 = fneg <2 x float> %i
@@ -3317,10 +3281,8 @@ define float @v_fabs_select_infloop_regression_f32(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fabs_select_infloop_regression_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, |v0|, 0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, |v0|, 0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 0.0, float %arg
   %i2 = call float @llvm.fabs.f32(float %i)
@@ -3368,10 +3330,8 @@ define float @v_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_fabs_select_infloop_regression:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -|v0|, 0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -|v0|, 0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 0.0, float %arg
   %i2 = call float @llvm.fabs.f32(float %i)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 9a8ddb5bd3831..8b0b0c785bbb5 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -7,18 +7,13 @@ define i32 @fneg_xor_select_i32(i1 %cond, i32 %arg0, i32 %arg1) {
 ; GCN-LABEL: fneg_xor_select_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v2, -v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v1, -v0, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v2, -v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v1, -v0, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 %arg0, i32 %arg1
   %fneg = xor i32 %select, -2147483648
@@ -57,10 +52,8 @@ define i32 @fneg_xor_select_i32_multi_use(i1 %cond, i32 %arg0, i32 %arg1, ptr ad
 ; GFX7-LABEL: fneg_xor_select_i32_multi_use:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT:    flat_store_dword v[3:4], v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; GFX7-NEXT:    flat_store_dword v[2:3], v0
 ; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -68,10 +61,8 @@ define i32 @fneg_xor_select_i32_multi_use(i1 %cond, i32 %arg0, i32 %arg1, ptr ad
 ; GFX9-LABEL: fneg_xor_select_i32_multi_use:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    global_store_dword v[3:4], v0, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -79,12 +70,10 @@ define i32 @fneg_xor_select_i32_multi_use(i1 %cond, i32 %arg0, i32 %arg1, ptr ad
 ; GFX11-LABEL: fneg_xor_select_i32_multi_use:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
-; GFX11-NEXT:    global_store_b32 v[3:4], v1, off
+; GFX11-NEXT:    global_store_b32 v[2:3], v1, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 %arg0, i32 %arg1
   store i32 %select, ptr addrspace(1) %ptr
@@ -96,20 +85,15 @@ define i64 @fneg_xor_select_i64(i1 %cond, i64 %arg0, i64 %arg1) {
 ; GCN-LABEL: fneg_xor_select_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v3, -v1, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v3, -v1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i64 %arg0, i64 %arg1
   %fneg = xor i64 %select, 9223372036854775808
@@ -152,19 +136,15 @@ define i16 @fneg_xor_select_i16(i1 %cond, i16 %arg0, i16 %arg1) {
 ; GCN-LABEL: fneg_xor_select_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i16 %arg0, i16 %arg1
@@ -231,10 +211,8 @@ define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr ad
 ; GFX7-LABEL: fneg_xor_select_i16_multi_use:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT:    flat_store_short v[3:4], v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; GFX7-NEXT:    flat_store_short v[2:3], v0
 ; GFX7-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -242,10 +220,8 @@ define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr ad
 ; GFX9-LABEL: fneg_xor_select_i16_multi_use:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    global_store_short v[3:4], v0, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_short v[2:3], v0, off
 ; GFX9-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -253,12 +229,10 @@ define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr ad
 ; GFX11-LABEL: fneg_xor_select_i16_multi_use:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v1
-; GFX11-NEXT:    global_store_b16 v[3:4], v1, off
+; GFX11-NEXT:    global_store_b16 v[2:3], v1, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i16 %arg0, i16 %arg1
   store i16 %select, ptr addrspace(1) %ptr
@@ -270,38 +244,34 @@ define i64 @fneg_xor_select_i64_multi_user(i1 %cond, i64 %arg0, i64 %arg1, ptr a
 ; GFX7-LABEL: fneg_xor_select_i64_multi_user:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v7, v1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[5:6], v[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v6, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v3, v6, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX7-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, -v3, -v6, s[4:5]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fneg_xor_select_i64_multi_user:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
-; GFX9-NEXT:    global_store_dwordx2 v[5:6], v[0:1], off
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, -v3, -v6, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i64_multi_user:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v4, v2 :: v_dual_cndmask_b32 v0, v3, v7
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, -v4, -v2, vcc_lo
-; GFX11-NEXT:    global_store_b64 v[5:6], v[0:1], off
+; GFX11-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v3, v6, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, -v3, -v6, s0
+; GFX11-NEXT:    global_store_b64 v[4:5], v[0:1], off
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i64 %arg0, i64 %arg1
@@ -314,30 +284,21 @@ define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg
 ; GCN-LABEL: select_fneg_xor_select_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_xor_select_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor i32 %arg0, -2147483648
   %select0 = select i1 %cond0, i32 %arg1, i32 %fneg0
@@ -350,25 +311,16 @@ define float @select_fneg_select_f32(i1 %cond0, i1 %cond1, float %arg0, float %a
 ; GCN-LABEL: select_fneg_select_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v2, v3, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, s[6:7]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_select_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v2, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = fneg float %arg0
   %select0 = select i1 %cond0, float %arg1, float %fneg0
@@ -381,20 +333,15 @@ define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) {
 ; GCN-LABEL: fneg_xor_select_f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v3, -v1, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v3, -v1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, double %arg0, double %arg1
   %fneg = fneg double %select
@@ -405,12 +352,9 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
 ; GFX7-LABEL: fneg_xor_select_f64_multi_user:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v7, v1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[5:6], v[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX7-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
 ; GFX7-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -418,12 +362,9 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
 ; GFX9-LABEL: fneg_xor_select_f64_multi_user:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
-; GFX9-NEXT:    global_store_dwordx2 v[5:6], v[0:1], off
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -431,13 +372,11 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
 ; GFX11-LABEL: fneg_xor_select_f64_multi_user:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v4, v2 :: v_dual_cndmask_b32 v0, v3, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
-; GFX11-NEXT:    global_store_b64 v[5:6], v[0:1], off
+; GFX11-NEXT:    global_store_b64 v[4:5], v[0:1], off
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, double %arg0, double %arg1
@@ -450,21 +389,18 @@ define double @fneg_xor_select_i64_user_with_srcmods(i1 %cond, i64 %arg0, i64 %a
 ; GCN-LABEL: fneg_xor_select_i64_user_with_srcmods:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GCN-NEXT:    v_add_f64 v[0:1], -v[1:2], 2.0
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GCN-NEXT:    v_add_f64 v[0:1], -v[0:1], 2.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i64_user_with_srcmods:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v2, v4, v2
-; GFX11-NEXT:    v_add_f64 v[0:1], -v[1:2], 2.0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f64 v[0:1], -v[0:1], 2.0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i64 %arg0, i64 %arg1
   %fneg = xor i64 %select, 9223372036854775808
@@ -477,32 +413,23 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
 ; GCN-LABEL: select_fneg_select_fneg_f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[6:7]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_select_fneg_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
+; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = fneg double %arg0
   %select0 = select i1 %cond0, double %arg1, double %fneg0
@@ -515,32 +442,23 @@ define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg
 ; GCN-LABEL: select_fneg_xor_select_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[6:7]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_xor_select_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
+; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor i64 %arg0, 9223372036854775808
   %select0 = select i1 %cond0, i64 %arg1, i64 %fneg0
@@ -553,45 +471,32 @@ define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1
 ; GFX7-LABEL: select_fneg_select_f16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e64 v2, -v2
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, vcc
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e64 v0, -v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, s[6:7]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: select_fneg_select_f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_select_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = fneg half %arg0
   %select0 = select i1 %cond0, half %arg1, half %fneg0
@@ -604,30 +509,21 @@ define i16 @select_fneg_xor_select_i16(i1 %cond0, i1 %cond1, i16 %arg0, i16 %arg
 ; GCN-LABEL: select_fneg_xor_select_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GCN-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_xor_select_i16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v0
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor i16 %arg0, -32768
   %select0 = select i1 %cond0, i16 %arg1, i16 %fneg0



More information about the llvm-commits mailing list