[llvm] [AMDGPU] Allocate i1 argument to SGPRs (PR #72461)

Jun Wang via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 21 15:48:08 PST 2023


https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/72461

>From bf53dc281335730b26aed7eae37d0717ea94f9fa Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Wed, 15 Nov 2023 19:48:41 -0600
Subject: [PATCH 1/5] [AMDGPU] Allocate i1 argument to SGPRs

Currently, i1 arguments are passed as 32-bit VGPRs. It makes more sense
to use SGPRs and pass these values as a wave-size bool mask.
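
To illustrate the intended ABI (a sketch based on the z_callee.ll test
added below, not a normative statement): a divergent i1 is a per-lane
predicate, so its natural home is a lane mask held in scalar registers,
64 bits wide on wave64 and 32 bits wide on wave32.

  ; With this change, %arg0 arrives in SGPRs as a lane mask (a single
  ; SGPR in this patch; later patches in the series widen it to an SGPR
  ; pair on wave64) instead of in v0. The callee materializes it per
  ; lane as 0/1, e.g.: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
  define void @void_func_i1(i1 %arg0) {
    store i1 %arg0, ptr addrspace(1) undef
    ret void
  }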
---
 llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td |  5 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   | 13 +++++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp      | 23 +++++++++
 llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp  |  6 +++
 llvm/test/CodeGen/AMDGPU/z_callee.ll        | 33 ++++++++++++
 llvm/test/CodeGen/AMDGPU/z_caller.ll        | 43 ++++++++++++++++
 llvm/test/CodeGen/AMDGPU/z_caller2.ll       | 57 +++++++++++++++++++++
 7 files changed, 179 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/z_callee.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/z_caller.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/z_caller2.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 9036b26a6f6bcb..3f18cbd661d5b2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -185,9 +185,12 @@ def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
 // Calling convention for leaf functions
 def CC_AMDGPU_Func : CallingConv<[
   CCIfByVal<CCPassByVal<4, 4>>,
-  CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
 
+  CCIfType<[i1] , CCAssignToReg<
+    !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
+  >>,
+
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
     !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
   >>>,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 708f212e204acf..04d1dde9bf1bc6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3479,6 +3479,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
   }
 
+  // In the code below (after the call to AnalyzeCallOperands), when flat
+  // scratch is not enabled, either s[48:51] or s[0:3] is used. Therefore,
+  // before calling AnalyzeCallOperands, we may need to
+  // reserve these registers.
+  if (!Subtarget->enableFlatScratch()) {
+    if (IsChainCallConv)
+      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
+          AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50, AMDGPU::SGPR51}, 4);
+    else
+      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
+          AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3}, 4);
+  }
+
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
 
   // Get a count of how many bytes are to be pushed on the stack.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d4746b559d9256..837f36ebe6d847 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -854,6 +854,16 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
+      // When the calling convention allocates an SGPR for an i1 argument,
+      // we may have an SGPR_64 to SReg_32 copy for an outgoing i1 argument.
+      // Adjust the copy to avoid an illegal copy.
+      if (AMDGPU::SGPR_64RegClass.contains(SrcReg)) {
+        auto sub0 = RI.getSubReg(SrcReg, AMDGPU::sub0);
+        if (sub0 != DestReg)
+          BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg).addReg(sub0);
+        return;
+      }
+
       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
       return;
     }
@@ -887,6 +897,19 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+      // When an i1 argument is allocated to an SGPR_32, we may have a COPY
+      // from SGPR_32 to SReg_64. The following handles this case to avoid
+      // an illegal copy.
+      if(AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
+        auto sub0 = RI.getSubReg(DestReg, AMDGPU::sub0);
+        if (sub0 != SrcReg) {
+          BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), sub0).addReg(SrcReg);
+        }
+        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32),
+                RI.getSubReg(DestReg, AMDGPU::sub1)).addImm(0);
+        return;
+      }
+
       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
       return;
     }
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 68c8f4024e7300..189333b063e522 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -513,6 +513,12 @@ bool SILowerI1Copies::lowerCopiesFromI1() {
       if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
         continue;
 
+      // When the calling convention allocates an i1 argument to an SGPR,
+      // we may have a COPY with dst being an SGPR_32. This should
+      // not be lowered into V_CNDMASK_B32.
+      if(AMDGPU::SGPR_32RegClass.contains(DstReg))
+        continue;
+
       Changed = true;
 
       // Copy into a 32-bit vector register.
diff --git a/llvm/test/CodeGen/AMDGPU/z_callee.ll b/llvm/test/CodeGen/AMDGPU/z_callee.ll
new file mode 100644
index 00000000000000..2fc4befa279f3e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_callee.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+define void @void_func_i1(i1 %arg0) #0 {
+; For CIGFX89, the i1 arg is passed in s4, but the v_cndmask insn uses s[4:5].
+; Therefore, the "s_mov_b32 s5, 0" is generated.
+;
+; CIGFX89-LABEL: void_func_i1:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s5, 0
+; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller.ll b/llvm/test/CodeGen/AMDGPU/z_caller.ll
new file mode 100644
index 00000000000000..faf25e407fca2c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_caller.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+
+declare hidden void @external_void_func_i1(i1) #0
+
+define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s4, -1
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_i1@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i1_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s0, -1
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+  call void @external_void_func_i1(i1 true)
+  ret void
+}
+
+attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller2.ll b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
new file mode 100644
index 00000000000000..e63ae50b7e91cd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+
+declare hidden void @external_void_func_i1_signext(i1 signext) #0
+
+define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_i1_signext@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_i1_signext@rel32@hi+12
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i1_signext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GFX11-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 1, v0
+; GFX11-NEXT:    s_mov_b32 s0, s2
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX11-NEXT:    s_endpgm
+  %var = load volatile i1, ptr addrspace(1) undef
+  call void @external_void_func_i1_signext(i1 signext %var)
+  ret void
+}
+
+
+
+attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }

>From 4af46db5a0731101d91da545a967f0f233c360fc Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Wed, 15 Nov 2023 20:37:27 -0600
Subject: [PATCH 2/5] Fix format.

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  | 11 +++++++----
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp     | 13 +++++++------
 llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp |  2 +-
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 04d1dde9bf1bc6..61e2855113ba23 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3485,11 +3485,14 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   // reserve these registers.
   if (!Subtarget->enableFlatScratch()) {
     if (IsChainCallConv)
-      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
-          AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50, AMDGPU::SGPR51}, 4);
+      CCInfo.AllocateRegBlock(
+          ArrayRef<MCPhysReg>{AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50,
+                              AMDGPU::SGPR51},
+          4);
     else
-      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
-          AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3}, 4);
+      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1,
+                                                  AMDGPU::SGPR2, AMDGPU::SGPR3},
+                              4);
   }
 
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 837f36ebe6d847..1ccca2adac9433 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -869,7 +869,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
-            .addReg(SrcReg, getKillRegState(KillSrc));
+        .addReg(SrcReg, getKillRegState(KillSrc));
     return;
   }
 
@@ -884,13 +884,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     if (DestReg == AMDGPU::VCC) {
       if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
-          .addReg(SrcReg, getKillRegState(KillSrc));
+            .addReg(SrcReg, getKillRegState(KillSrc));
       } else {
         // FIXME: Hack until VReg_1 removed.
         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
-          .addImm(0)
-          .addReg(SrcReg, getKillRegState(KillSrc));
+            .addImm(0)
+            .addReg(SrcReg, getKillRegState(KillSrc));
       }
 
       return;
@@ -900,13 +900,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       // When an i1 argument is allocated to an SGPR_32, we may have a COPY
       // from SGPR_32 to SReg_64. The following handles this case to avoid
       // an illegal copy.
-      if(AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
+      if (AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
         auto sub0 = RI.getSubReg(DestReg, AMDGPU::sub0);
         if (sub0 != SrcReg) {
           BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), sub0).addReg(SrcReg);
         }
         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32),
-                RI.getSubReg(DestReg, AMDGPU::sub1)).addImm(0);
+                RI.getSubReg(DestReg, AMDGPU::sub1))
+            .addImm(0);
         return;
       }
 
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 189333b063e522..161ea3ddc0435c 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -516,7 +516,7 @@ bool SILowerI1Copies::lowerCopiesFromI1() {
       // When the calling convention allocates an i1 argument to an SGPR,
       // we may have a COPY with dst being an SGPR_32. This should
       // not be lowered into V_CNDMASK_B32.
-      if(AMDGPU::SGPR_32RegClass.contains(DstReg))
+      if (AMDGPU::SGPR_32RegClass.contains(DstReg))
         continue;
 
       Changed = true;

>From 072ba02588343f49d6e51f00171d410c1a5a131a Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 30 Nov 2023 12:31:17 -0600
Subject: [PATCH 3/5] Create a custom calling convention function for i1.

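A custom handler is used because the register list now depends on the
subtarget: wave64 needs an SGPR pair per i1 (a 64-bit lane mask) while
wave32 needs a single SGPR, and a single CCAssignToReg register list in
TableGen cannot hold both choices. A hypothetical allocation sketch,
assuming no SGPRs were claimed by earlier arguments:

  define void @two_bools(i1 %a, i1 %b) {
    ret void
  }
  ; wave64: %a -> SGPR0_SGPR1, %b -> SGPR2_SGPR3  (from I1RegList1)
  ; wave32: %a -> SGPR0,       %b -> SGPR1        (from I1RegList2)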
---
 llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td   |  9 +--
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 31 +++++++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  9 ++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 24 ------
 llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp    | 13 +--
 llvm/test/CodeGen/AMDGPU/z_callee.ll          |  7 +-
 llvm/test/CodeGen/AMDGPU/z_caller.ll          |  6 +-
 llvm/test/CodeGen/AMDGPU/z_caller2.ll         |  4 +-
 llvm/test/CodeGen/AMDGPU/z_return.ll          | 80 +++++++++++++++++++
 9 files changed, 137 insertions(+), 46 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/z_return.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 3f18cbd661d5b2..954145f092ed7a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -187,9 +187,7 @@ def CC_AMDGPU_Func : CallingConv<[
   CCIfByVal<CCPassByVal<4, 4>>,
   CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
 
-  CCIfType<[i1] , CCAssignToReg<
-    !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
-  >>,
+  CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
 
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
     !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
@@ -205,8 +203,9 @@ def CC_AMDGPU_Func : CallingConv<[
 
 // Calling convention for leaf functions
 def RetCC_AMDGPU_Func : CallingConv<[
-  CCIfType<[i1], CCPromoteToType<i32>>,
-  CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
+  CCIfType<[i16], CCIfExtend<CCPromoteToType<i32>>>,
+  CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
+
   CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 9d7443012e3da3..5ed7bd5417e3a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -29,6 +29,37 @@
 
 using namespace llvm;
 
+static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT,
+                           MVT LocVT, CCValAssign::LocInfo LocInfo,
+                           ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static bool IsWave64 = static_cast<const GCNSubtarget&>(State.getMachineFunction().getSubtarget()).isWave64();
+
+  static const MCPhysReg I1RegList1[] = {
+    AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR2_SGPR3, AMDGPU::SGPR4_SGPR5,
+    AMDGPU::SGPR6_SGPR7, AMDGPU::SGPR8_SGPR9, AMDGPU::SGPR10_SGPR11,
+    AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
+    AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
+    AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29
+  };
+
+  static const MCPhysReg I1RegList2[] = {
+    AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3, AMDGPU::SGPR4,
+    AMDGPU::SGPR5, AMDGPU::SGPR6, AMDGPU::SGPR7, AMDGPU::SGPR8, AMDGPU::SGPR9,
+    AMDGPU::SGPR10, AMDGPU::SGPR11, AMDGPU::SGPR12, AMDGPU::SGPR13,
+    AMDGPU::SGPR14, AMDGPU::SGPR15, AMDGPU::SGPR16, AMDGPU::SGPR17,
+    AMDGPU::SGPR18, AMDGPU::SGPR19, AMDGPU::SGPR20, AMDGPU::SGPR21,
+    AMDGPU::SGPR22, AMDGPU::SGPR23, AMDGPU::SGPR24, AMDGPU::SGPR25,
+    AMDGPU::SGPR26, AMDGPU::SGPR27, AMDGPU::SGPR28, AMDGPU::SGPR29
+  };
+
+  assert (LocVT == MVT::i1);
+  if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1) : State.AllocateReg(I1RegList2)) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return true;
+  }
+  return false; // not allocated
+}
+
 #include "AMDGPUGenCallingConv.inc"
 
 static cl::opt<bool> AMDGPUBypassSlowDiv(
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 61e2855113ba23..6eda46120a8172 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2831,8 +2831,13 @@ SDValue SITargetLowering::LowerFormalArguments(
       RC = &AMDGPU::VGPR_32RegClass;
     else if (AMDGPU::SGPR_32RegClass.contains(Reg))
       RC = &AMDGPU::SGPR_32RegClass;
-    else
-      llvm_unreachable("Unexpected register class in LowerFormalArguments!");
+    else {
+      if (VT == MVT::i1 && Subtarget->isWave64())
+        RC = &AMDGPU::SGPR_64RegClass;
+      else
+        llvm_unreachable("Unexpected register class in LowerFormalArguments!");
+    }
+
     EVT ValVT = VA.getValVT();
 
     Reg = MF.addLiveIn(Reg, RC);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 1ccca2adac9433..b28e88b3711e84 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -854,16 +854,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
-      // When the calling convention allocates an SGPR for an i1 argument,
-      // we may have an SGPR_64 to SReg_32 copy for an outgoing i1 argument.
-      // Adjust the copy to avoid an illegal copy.
-      if (AMDGPU::SGPR_64RegClass.contains(SrcReg)) {
-        auto sub0 = RI.getSubReg(SrcReg, AMDGPU::sub0);
-        if (sub0 != DestReg)
-          BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg).addReg(sub0);
-        return;
-      }
-
       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
       return;
     }
@@ -897,20 +887,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
-      // When an i1 argument is allocated to an SGPR_32, we may have a COPY
-      // from SGPR_32 to SReg_64. The following handles this case to avoid
-      // an illegal copy.
-      if (AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
-        auto sub0 = RI.getSubReg(DestReg, AMDGPU::sub0);
-        if (sub0 != SrcReg) {
-          BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), sub0).addReg(SrcReg);
-        }
-        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32),
-                RI.getSubReg(DestReg, AMDGPU::sub1))
-            .addImm(0);
-        return;
-      }
-
       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
       return;
     }
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 161ea3ddc0435c..546176c69dd7a4 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -513,12 +513,6 @@ bool SILowerI1Copies::lowerCopiesFromI1() {
       if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
         continue;
 
-      // When the calling convention allocates an i1 argument to an SGPR,
-      // we may have a COPY with dst being an SGPR_32. This should
-      // not be lowered into V_CNDMASK_B32.
-      if (AMDGPU::SGPR_32RegClass.contains(DstReg))
-        continue;
-
       Changed = true;
 
       // Copy into a 32-bit vector register.
@@ -721,6 +715,13 @@ bool SILowerI1Copies::lowerCopiesToI1() {
       assert(!MI.getOperand(1).getSubReg());
 
       if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
+        if (!SrcReg.isVirtual() && TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
+          // When the calling convention allocates an SGPR for i1, for GPUs with wavefront size 64,
+          // the i1 return value is put in a 64-bit SGPR.
+          assert(ST->isWave64());
+          continue;
+        }
+
         assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
         unsigned TmpReg = createLaneMaskReg(*MF);
         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
diff --git a/llvm/test/CodeGen/AMDGPU/z_callee.ll b/llvm/test/CodeGen/AMDGPU/z_callee.ll
index 2fc4befa279f3e..44af2c90f900b3 100644
--- a/llvm/test/CodeGen/AMDGPU/z_callee.ll
+++ b/llvm/test/CodeGen/AMDGPU/z_callee.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
 
 define void @void_func_i1(i1 %arg0) #0 {
 ; For CIGFX89, the i1 arg is passed in s4, but the v_cndmask insn uses s[4:5].
@@ -11,7 +11,6 @@ define void @void_func_i1(i1 %arg0) #0 {
 ; CIGFX89-LABEL: void_func_i1:
 ; CIGFX89:       ; %bb.0:
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    s_mov_b32 s5, 0
 ; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller.ll b/llvm/test/CodeGen/AMDGPU/z_caller.ll
index faf25e407fca2c..f9203cf078e47c 100644
--- a/llvm/test/CodeGen/AMDGPU/z_caller.ll
+++ b/llvm/test/CodeGen/AMDGPU/z_caller.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
 
 
 declare hidden void @external_void_func_i1(i1) #0
@@ -17,7 +17,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    s_mov_b32 s4, -1
+; GFX9-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_getpc_b64 s[8:9]
 ; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller2.ll b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
index e63ae50b7e91cd..1141476960250a 100644
--- a/llvm/test/CodeGen/AMDGPU/z_caller2.ll
+++ b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
 
 
 declare hidden void @external_void_func_i1_signext(i1 signext) #0
diff --git a/llvm/test/CodeGen/AMDGPU/z_return.ll b/llvm/test/CodeGen/AMDGPU/z_return.ll
new file mode 100644
index 00000000000000..6bf64da7a1b8ff
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_return.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+define i1 @i1_func_void() #0 {
+  %val = load i1, ptr addrspace(1) undef
+  ret i1 %val
+}
+
+define void @test_call_i1_func_void() #0 {
+; CIGFX89-LABEL: test_call_i1_func_void:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s6, s33
+; CIGFX89-NEXT:    s_mov_b32 s33, s32
+; CIGFX89-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; CIGFX89-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; CIGFX89-NEXT:    s_mov_b64 exec, s[4:5]
+; CIGFX89-NEXT:    s_addk_i32 s32, 0x400
+; CIGFX89-NEXT:    s_getpc_b64 s[4:5]
+; CIGFX89-NEXT:    s_add_u32 s4, s4, i1_func_void@gotpcrel32@lo+4
+; CIGFX89-NEXT:    s_addc_u32 s5, s5, i1_func_void@gotpcrel32@hi+12
+; CIGFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CIGFX89-NEXT:    v_writelane_b32 v1, s30, 0
+; CIGFX89-NEXT:    v_writelane_b32 v1, s31, 1
+; CIGFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; CIGFX89-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CIGFX89-NEXT:    global_store_byte v[2:3], v0, off
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    v_readlane_b32 s31, v1, 1
+; CIGFX89-NEXT:    v_readlane_b32 s30, v1, 0
+; CIGFX89-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; CIGFX89-NEXT:    buffer_load_dword v1, off, s[0:3], s33  ; 4-byte Folded Reload
+; CIGFX89-NEXT:    s_mov_b64 exec, s[4:5]
+; CIGFX89-NEXT:    s_addk_i32 s32, 0xfc00
+; CIGFX89-NEXT:    s_mov_b32 s33, s6
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_i1_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v1, s33          ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, i1_func_void@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, i1_func_void@gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v1, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v1, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, s0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    global_store_b8 v[2:3], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v1, off, s33           ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+
+  %val = call i1 @i1_func_void()
+  store volatile i1 %val, ptr addrspace(1) undef
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+

>From 9aa2b86846453055a97e3a4349cfdcf22711467d Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 30 Nov 2023 20:04:19 -0600
Subject: [PATCH 4/5] Fix formatting.

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 44 ++++++++++---------
 llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp    |  7 +--
 2 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 5ed7bd5417e3a4..ac8432e75e6c5a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -29,31 +29,33 @@
 
 using namespace llvm;
 
-static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT,
-                           MVT LocVT, CCValAssign::LocInfo LocInfo,
-                           ISD::ArgFlagsTy ArgFlags, CCState &State) {
-  static bool IsWave64 = static_cast<const GCNSubtarget&>(State.getMachineFunction().getSubtarget()).isWave64();
+static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT, MVT LocVT,
+                                CCValAssign::LocInfo LocInfo,
+                                ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static bool IsWave64 = static_cast<const GCNSubtarget &>(
+                             State.getMachineFunction().getSubtarget())
+                             .isWave64();
 
   static const MCPhysReg I1RegList1[] = {
-    AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR2_SGPR3, AMDGPU::SGPR4_SGPR5,
-    AMDGPU::SGPR6_SGPR7, AMDGPU::SGPR8_SGPR9, AMDGPU::SGPR10_SGPR11,
-    AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
-    AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
-    AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29
-  };
+      AMDGPU::SGPR0_SGPR1,   AMDGPU::SGPR2_SGPR3,   AMDGPU::SGPR4_SGPR5,
+      AMDGPU::SGPR6_SGPR7,   AMDGPU::SGPR8_SGPR9,   AMDGPU::SGPR10_SGPR11,
+      AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
+      AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
+      AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29};
 
   static const MCPhysReg I1RegList2[] = {
-    AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3, AMDGPU::SGPR4,
-    AMDGPU::SGPR5, AMDGPU::SGPR6, AMDGPU::SGPR7, AMDGPU::SGPR8, AMDGPU::SGPR9,
-    AMDGPU::SGPR10, AMDGPU::SGPR11, AMDGPU::SGPR12, AMDGPU::SGPR13,
-    AMDGPU::SGPR14, AMDGPU::SGPR15, AMDGPU::SGPR16, AMDGPU::SGPR17,
-    AMDGPU::SGPR18, AMDGPU::SGPR19, AMDGPU::SGPR20, AMDGPU::SGPR21,
-    AMDGPU::SGPR22, AMDGPU::SGPR23, AMDGPU::SGPR24, AMDGPU::SGPR25,
-    AMDGPU::SGPR26, AMDGPU::SGPR27, AMDGPU::SGPR28, AMDGPU::SGPR29
-  };
-
-  assert (LocVT == MVT::i1);
-  if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1) : State.AllocateReg(I1RegList2)) {
+      AMDGPU::SGPR0,  AMDGPU::SGPR1,  AMDGPU::SGPR2,  AMDGPU::SGPR3,
+      AMDGPU::SGPR4,  AMDGPU::SGPR5,  AMDGPU::SGPR6,  AMDGPU::SGPR7,
+      AMDGPU::SGPR8,  AMDGPU::SGPR9,  AMDGPU::SGPR10, AMDGPU::SGPR11,
+      AMDGPU::SGPR12, AMDGPU::SGPR13, AMDGPU::SGPR14, AMDGPU::SGPR15,
+      AMDGPU::SGPR16, AMDGPU::SGPR17, AMDGPU::SGPR18, AMDGPU::SGPR19,
+      AMDGPU::SGPR20, AMDGPU::SGPR21, AMDGPU::SGPR22, AMDGPU::SGPR23,
+      AMDGPU::SGPR24, AMDGPU::SGPR25, AMDGPU::SGPR26, AMDGPU::SGPR27,
+      AMDGPU::SGPR28, AMDGPU::SGPR29};
+
+  assert(LocVT == MVT::i1);
+  if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1)
+                              : State.AllocateReg(I1RegList2)) {
     State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
     return true;
   }
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 546176c69dd7a4..211250fac4eea7 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -715,9 +715,10 @@ bool SILowerI1Copies::lowerCopiesToI1() {
       assert(!MI.getOperand(1).getSubReg());
 
       if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
-        if (!SrcReg.isVirtual() && TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
-          // When the calling convention allocates an SGPR for i1, for GPUs with wavefront size 64,
-          // the i1 return value is put in a 64-bit SGPR.
+        if (!SrcReg.isVirtual() &&
+            TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
+          // When the calling convention allocates an SGPR for i1, for GPUs
+          // with wavefront size 64, the i1 return value is put in a 64-bit SGPR.
           assert(ST->isWave64());
           continue;
         }

>From 0602c5dd1a92fe9f19ca8d6404af5a867db1e325 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 21 Dec 2023 16:13:47 -0600
Subject: [PATCH 5/5] Fix (1) problems for GlobalISel w.r.t. both incoming
 args and the return value, and (2) a problem in AMDGPUCallingConv.td when
 no SGPRs are available.

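For the second fix: CC_AMDGPU_Custom_I1 returns false once every SGPR in
its list has been allocated, which previously left such an i1 argument
with no location at all. The new fallback entry in AMDGPUCallingConv.td
(the two TableGen lines below are quoted from the diff; the behavior note
is my reading of them) makes the i1 fall through to the ordinary i32 path
instead:

  CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
  CCIfType<[i1], CCPromoteToType<i32>>,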
---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |  20 +-
 llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td   |   2 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  13 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   2 +-
 .../irtranslator-call-return-values.ll        |  20 +-
 .../AMDGPU/GlobalISel/irtranslator-call.ll    |  18 +-
 .../GlobalISel/irtranslator-function-args.ll  | 246 ++++++++++--
 .../GlobalISel/irtranslator-invariant.ll      |   6 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll |  48 +--
 .../CodeGen/AMDGPU/GlobalISel/localizer.ll    |   6 +-
 ...amdgpu-codegenprepare-fold-binop-select.ll | 278 +++++++-------
 llvm/test/CodeGen/AMDGPU/function-args.ll     | 363 +++++++++++++++---
 llvm/test/CodeGen/AMDGPU/function-returns.ll  |   5 +
 llvm/test/CodeGen/AMDGPU/z_callee.ll          |  32 --
 llvm/test/CodeGen/AMDGPU/z_caller.ll          |  43 ---
 llvm/test/CodeGen/AMDGPU/z_caller2.ll         |  57 ---
 llvm/test/CodeGen/AMDGPU/z_return.ll          |  80 ----
 17 files changed, 744 insertions(+), 495 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/z_callee.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/z_caller.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/z_caller2.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/z_return.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index cf2896f80f19bb..0ec295c63fb694 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -124,7 +124,15 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
     if (VA.getLocVT().getSizeInBits() < 32) {
       // 16-bit types are reported as legal for 32-bit registers. We need to do
       // a 32-bit copy, and truncate to avoid the verifier complaining about it.
-      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
+      unsigned CopyToBits = 32;
+
+      // When the function return type is i1, it may be in a 64-bit register.
+      if (VA.getLocVT().getSizeInBits() == 1) {
+        if (MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64)
+          CopyToBits = 64;
+      }
+
+      auto Copy = MIRBuilder.buildCopy(LLT::scalar(CopyToBits), PhysReg);
 
       // If we have signext/zeroext, it applies to the whole 32-bit register
       // before truncation.
@@ -233,7 +241,15 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
   void assignValueToReg(Register ValVReg, Register PhysReg,
                         const CCValAssign &VA) override {
     MIB.addUse(PhysReg, RegState::Implicit);
-    Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
+    Register ExtReg;
+
+    if (VA.getLocVT().getSizeInBits() == 1 &&
+        MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64) {
+      ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(64), ValVReg).getReg(0);
+    } else {
+      ExtReg = extendRegisterMin32(*this, ValVReg, VA);
+    }
+
     MIRBuilder.buildCopy(PhysReg, ExtReg);
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 954145f092ed7a..4db89fc75531d6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -189,6 +189,8 @@ def CC_AMDGPU_Func : CallingConv<[
 
   CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
 
+  CCIfType<[i1], CCPromoteToType<i32>>,
+
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
     !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
   >>>,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index ac8432e75e6c5a..ed654c50582ee9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -32,18 +32,17 @@ using namespace llvm;
 static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT, MVT LocVT,
                                 CCValAssign::LocInfo LocInfo,
                                 ISD::ArgFlagsTy ArgFlags, CCState &State) {
-  static bool IsWave64 = static_cast<const GCNSubtarget &>(
-                             State.getMachineFunction().getSubtarget())
-                             .isWave64();
+  static bool IsWave64 =
+      State.getMachineFunction().getSubtarget<GCNSubtarget>().isWave64();
 
-  static const MCPhysReg I1RegList1[] = {
+  static const MCPhysReg SGPRArgsWave64[] = {
       AMDGPU::SGPR0_SGPR1,   AMDGPU::SGPR2_SGPR3,   AMDGPU::SGPR4_SGPR5,
       AMDGPU::SGPR6_SGPR7,   AMDGPU::SGPR8_SGPR9,   AMDGPU::SGPR10_SGPR11,
       AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
       AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
       AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29};
 
-  static const MCPhysReg I1RegList2[] = {
+  static const MCPhysReg SGPRArgsWave32[] = {
       AMDGPU::SGPR0,  AMDGPU::SGPR1,  AMDGPU::SGPR2,  AMDGPU::SGPR3,
       AMDGPU::SGPR4,  AMDGPU::SGPR5,  AMDGPU::SGPR6,  AMDGPU::SGPR7,
       AMDGPU::SGPR8,  AMDGPU::SGPR9,  AMDGPU::SGPR10, AMDGPU::SGPR11,
@@ -54,8 +53,8 @@ static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT, MVT LocVT,
       AMDGPU::SGPR28, AMDGPU::SGPR29};
 
   assert(LocVT == MVT::i1);
-  if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1)
-                              : State.AllocateReg(I1RegList2)) {
+  if (unsigned Reg = IsWave64 ? State.AllocateReg(SGPRArgsWave64)
+                              : State.AllocateReg(SGPRArgsWave32)) {
     State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
     return true;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6eda46120a8172..e9687a0621f7c0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2833,7 +2833,7 @@ SDValue SITargetLowering::LowerFormalArguments(
       RC = &AMDGPU::SGPR_32RegClass;
     else {
       if (VT == MVT::i1 && Subtarget->isWave64())
-        RC = &AMDGPU::SGPR_64RegClass;
+        RC = Subtarget->getBoolRC();
       else
         llvm_unreachable("Unexpected register class in LowerFormalArguments!");
     }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
index 609883c190223a..b8e46802516179 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
@@ -200,9 +200,9 @@ define amdgpu_kernel void @test_call_external_i1_func_void() #0 {
   ; GCN-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; GCN-NEXT:   $sgpr15 = COPY [[DEF1]](s32)
   ; GCN-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0
-  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
@@ -278,10 +278,9 @@ define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 {
   ; GCN-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; GCN-NEXT:   $sgpr15 = COPY [[DEF1]](s32)
   ; GCN-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0
-  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; GCN-NEXT:   [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY21]], 1
-  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_ZEXT]](s32)
+  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
   ; GCN-NEXT:   G_STORE [[ZEXT]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
@@ -340,10 +339,9 @@ define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 {
   ; GCN-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; GCN-NEXT:   $sgpr15 = COPY [[DEF1]](s32)
   ; GCN-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0
-  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; GCN-NEXT:   [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY21]], 1
-  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_SEXT]](s32)
+  ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
   ; GCN-NEXT:   G_STORE [[SEXT]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index 2ede504223cb82..133e490b0026b2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -354,8 +354,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
   ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
   ; CHECK-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32)
   ; CHECK-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s1)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[ANYEXT]](s64)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -367,7 +367,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[DEF]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   S_ENDPGM 0
   call void @external_void_func_i1(i1 true)
@@ -413,8 +413,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
   ; CHECK-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32)
   ; CHECK-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[SEXT]](s32)
+  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s1)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[SEXT]](s64)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -426,7 +426,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[DEF1]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_signext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_signext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   S_ENDPGM 0
   %var = load volatile i1, ptr addrspace(1) undef
@@ -473,8 +473,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
   ; CHECK-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32)
   ; CHECK-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
-  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ZEXT]](s32)
+  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s1)
+  ; CHECK-NEXT:   $sgpr0_sgpr1 = COPY [[ZEXT]](s64)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -486,7 +486,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[DEF1]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[OR1]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_zeroext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_zeroext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   S_ENDPGM 0
   %var = load volatile i1, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index 062d2e173e6d8a..f6d5858b46f6e0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -3,6 +3,7 @@
 ; the frame info, so some functions have manually added stack object
 ; checks.
 ; RUN: llc -march=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s
 ; FIXME: pre-VI should have same ABI without legal i16 operations.
 
 define void @void_func_empty_arg({} %arg0, i32 %arg1) #0 {
@@ -34,10 +35,10 @@ define void @void_func_empty_array([0 x i8] %arg0, i32 %arg1) #0 {
 define void @void_func_i1(i1 %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
@@ -48,11 +49,10 @@ define void @void_func_i1(i1 %arg0) #0 {
 define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1_zeroext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 1
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_ZEXT]](s32)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
@@ -68,11 +68,10 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
 define void @void_func_i1_signext(i1 signext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1_signext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY]], 1
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_SEXT]](s32)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
@@ -89,10 +88,10 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
   ; CHECK-LABEL: name: i1_arg_i1_use
   ; CHECK: bb.1.bb:
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -1986,25 +1985,25 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
   ; CHECK-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28
   ; CHECK-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
   ; CHECK-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
-  ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.4, align 16, addrspace 5)
+  ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, addrspace 5)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32)
-  ; CHECK-NEXT:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s1) from %fixed-stack.3, align 4, addrspace 5)
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD1]](s32)
-  ; CHECK-NEXT:   [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
-  ; CHECK-NEXT:   [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s16) from %fixed-stack.2, align 8, addrspace 5)
-  ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD2]](s16)
-  ; CHECK-NEXT:   [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
-  ; CHECK-NEXT:   [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s16) from %fixed-stack.1, align 4, addrspace 5)
-  ; CHECK-NEXT:   [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
-  ; CHECK-NEXT:   [[LOAD4:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s16) from %fixed-stack.0, align 16, addrspace 5)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s64)
+  ; CHECK-NEXT:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
+  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s16) from %fixed-stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s16)
+  ; CHECK-NEXT:   [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
+  ; CHECK-NEXT:   [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s16) from %fixed-stack.1, align 8, addrspace 5)
+  ; CHECK-NEXT:   [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
+  ; CHECK-NEXT:   [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s16) from %fixed-stack.0, align 4, addrspace 5)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[LOAD2]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   G_STORE [[LOAD3]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD4]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   SI_RETURN
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile i1 %arg1, ptr addrspace(1) undef
@@ -2777,4 +2776,197 @@ define void @vector_ptr_in_struct_arg({ <2 x ptr addrspace(1)>, <2 x ptr addrspa
   ret void
 }
 
+; Check calling convention for i1 args
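+; A rough map of the expected assignment, read off the CHECK/GFX11 lines
+; below (illustrative summary only, not an extra RUN): on the wave64 target
+; the first 15 i1 args arrive as 64-bit masks in the SGPR pairs
+; $sgpr0_sgpr1 .. $sgpr28_sgpr29 and the remaining 17 fall back to 32-bit
+; VGPRs; on the wave32 GFX11 run a mask fits in a single SGPR, so
+; $sgpr0 .. $sgpr29 cover the first 30 args and the last two use
+; $vgpr0/$vgpr1.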
+define void @many_i1_args(
+  i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
+  i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
+  i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
+  i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
+; CHECK-LABEL: name: many_i1_args
+; CHECK: bb.1 (%ir-block.0):
+; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
+; CHECK-NEXT: {{  $}}
+; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s64) = COPY $sgpr2_sgpr3
+; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s64)
+; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; CHECK-NEXT:   [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
+; CHECK-NEXT:   [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
+; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(s64) = COPY $sgpr8_sgpr9
+; CHECK-NEXT:   [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s64)
+; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(s64) = COPY $sgpr10_sgpr11
+; CHECK-NEXT:   [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s64)
+; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(s64) = COPY $sgpr12_sgpr13
+; CHECK-NEXT:   [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s64)
+; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(s64) = COPY $sgpr14_sgpr15
+; CHECK-NEXT:   [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s64)
+; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
+; CHECK-NEXT:   [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s64)
+; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(s64) = COPY $sgpr18_sgpr19
+; CHECK-NEXT:   [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s64)
+; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(s64) = COPY $sgpr20_sgpr21
+; CHECK-NEXT:   [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s64)
+; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(s64) = COPY $sgpr22_sgpr23
+; CHECK-NEXT:   [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s64)
+; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s64) = COPY $sgpr24_sgpr25
+; CHECK-NEXT:   [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s64)
+; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s64) = COPY $sgpr26_sgpr27
+; CHECK-NEXT:   [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s64)
+; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s64) = COPY $sgpr28_sgpr29
+; CHECK-NEXT:   [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s64)
+; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr0
+; CHECK-NEXT:   [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
+; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr1
+; CHECK-NEXT:   [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
+; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr2
+; CHECK-NEXT:   [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
+; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr3
+; CHECK-NEXT:   [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
+; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr4
+; CHECK-NEXT:   [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr5
+; CHECK-NEXT:   [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
+; CHECK-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr6
+; CHECK-NEXT:   [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+; CHECK-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr7
+; CHECK-NEXT:   [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
+; CHECK-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr8
+; CHECK-NEXT:   [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
+; CHECK-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr9
+; CHECK-NEXT:   [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
+; CHECK-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr10
+; CHECK-NEXT:   [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
+; CHECK-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr11
+; CHECK-NEXT:   [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
+; CHECK-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr12
+; CHECK-NEXT:   [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
+; CHECK-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr13
+; CHECK-NEXT:   [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
+; CHECK-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr14
+; CHECK-NEXT:   [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; CHECK-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr15
+; CHECK-NEXT:   [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr16
+; CHECK-NEXT:   [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+;
+; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs to TRUNC1-TRUNC30 omitted
+; CHECK:        G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+;
+; GFX11-LABEL: name: many_i1_args
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1
+; GFX11-NEXT: {{  $}}
+; GFX11-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT:   [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
+; GFX11-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+; GFX11-NEXT:   [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr3
+; GFX11-NEXT:   [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s32)
+; GFX11-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX11-NEXT:   [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s32)
+; GFX11-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr5
+; GFX11-NEXT:   [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s32)
+; GFX11-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX11-NEXT:   [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s32)
+; GFX11-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr7
+; GFX11-NEXT:   [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
+; GFX11-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr8
+; GFX11-NEXT:   [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
+; GFX11-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr9
+; GFX11-NEXT:   [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
+; GFX11-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr10
+; GFX11-NEXT:   [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
+; GFX11-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr11
+; GFX11-NEXT:   [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
+; GFX11-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr12
+; GFX11-NEXT:   [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
+; GFX11-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY $sgpr13
+; GFX11-NEXT:   [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
+; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY $sgpr14
+; GFX11-NEXT:   [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
+; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $sgpr15
+; GFX11-NEXT:   [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
+; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY $sgpr16
+; GFX11-NEXT:   [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
+; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY $sgpr17
+; GFX11-NEXT:   [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
+; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY $sgpr18
+; GFX11-NEXT:   [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
+; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY $sgpr19
+; GFX11-NEXT:   [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY $sgpr20
+; GFX11-NEXT:   [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
+; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY $sgpr21
+; GFX11-NEXT:   [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY $sgpr22
+; GFX11-NEXT:   [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
+; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY $sgpr23
+; GFX11-NEXT:   [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
+; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $sgpr24
+; GFX11-NEXT:   [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
+; GFX11-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY $sgpr25
+; GFX11-NEXT:   [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
+; GFX11-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY $sgpr26
+; GFX11-NEXT:   [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
+; GFX11-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY $sgpr27
+; GFX11-NEXT:   [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
+; GFX11-NEXT:   [[COPY28:%[0-9]+]]:_(s32) = COPY $sgpr28
+; GFX11-NEXT:   [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
+; GFX11-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY $sgpr29
+; GFX11-NEXT:   [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; GFX11-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT:   [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; GFX11-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX11-NEXT:   [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+;
+; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs to TRUNC1-TRUNC30 omitted
+; GFX11:        G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  store volatile i1 %arg2, ptr addrspace(1) undef
+  store volatile i1 %arg3, ptr addrspace(1) undef
+  store volatile i1 %arg4, ptr addrspace(1) undef
+  store volatile i1 %arg5, ptr addrspace(1) undef
+  store volatile i1 %arg6, ptr addrspace(1) undef
+  store volatile i1 %arg7, ptr addrspace(1) undef
+
+  store volatile i1 %arg8, ptr addrspace(1) undef
+  store volatile i1 %arg9, ptr addrspace(1) undef
+  store volatile i1 %arg10, ptr addrspace(1) undef
+  store volatile i1 %arg11, ptr addrspace(1) undef
+  store volatile i1 %arg12, ptr addrspace(1) undef
+  store volatile i1 %arg13, ptr addrspace(1) undef
+  store volatile i1 %arg14, ptr addrspace(1) undef
+  store volatile i1 %arg15, ptr addrspace(1) undef
+
+  store volatile i1 %arg16, ptr addrspace(1) undef
+  store volatile i1 %arg17, ptr addrspace(1) undef
+  store volatile i1 %arg18, ptr addrspace(1) undef
+  store volatile i1 %arg19, ptr addrspace(1) undef
+  store volatile i1 %arg20, ptr addrspace(1) undef
+  store volatile i1 %arg21, ptr addrspace(1) undef
+  store volatile i1 %arg22, ptr addrspace(1) undef
+  store volatile i1 %arg23, ptr addrspace(1) undef
+
+  store volatile i1 %arg24, ptr addrspace(1) undef
+  store volatile i1 %arg25, ptr addrspace(1) undef
+  store volatile i1 %arg26, ptr addrspace(1) undef
+  store volatile i1 %arg27, ptr addrspace(1) undef
+  store volatile i1 %arg28, ptr addrspace(1) undef
+  store volatile i1 %arg29, ptr addrspace(1) undef
+  store volatile i1 %arg30, ptr addrspace(1) undef
+  store volatile i1 %arg31, ptr addrspace(1) undef
+
+  ret void
+}
+
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
index 7c7a600c4bda2b..f9e717d1424e28 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
@@ -22,10 +22,10 @@ define i32 @load_const_i32_gv() {
 define i32 @load_select_const_i32_gv(i1 %cond) {
   ; CHECK-LABEL: name: load_select_const_i32_gv
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv0
   ; CHECK-NEXT:   [[GV1:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv1
   ; CHECK-NEXT:   [[SELECT:%[0-9]+]]:_(p1) = G_SELECT [[TRUNC]](s1), [[GV]], [[GV1]]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index a5482bd5b79a96..c3b8a6b2b75263 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -10,8 +10,8 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX7-LABEL: v_div_fmas_f32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX7-NEXT:    s_and_b32 s4, 1, s0
+; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX7-NEXT:    s_nop 3
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -19,8 +19,8 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX8-LABEL: v_div_fmas_f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX8-NEXT:    s_and_b32 s4, 1, s0
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX8-NEXT:    s_nop 3
 ; GFX8-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -28,32 +28,32 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX10_W32-LABEL: v_div_fmas_f32:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W32-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX10_W32-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX10_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_W64-LABEL: v_div_fmas_f32:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX10_W64-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX10_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11_W32-LABEL: v_div_fmas_f32:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W32-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX11_W32-NEXT:    s_and_b32 s0, 1, s0
+; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX11_W32-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX11_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11_W64-LABEL: v_div_fmas_f32:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W64-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX11_W64-NEXT:    s_and_b32 s0, 1, s0
+; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX11_W64-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX11_W64-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d)
@@ -64,8 +64,8 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX7-LABEL: v_div_fmas_f64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX7-NEXT:    s_and_b32 s4, 1, s0
+; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX7-NEXT:    s_nop 3
 ; GFX7-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -73,8 +73,8 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX8-LABEL: v_div_fmas_f64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX8-NEXT:    s_and_b32 s4, 1, s0
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX8-NEXT:    s_nop 3
 ; GFX8-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -82,32 +82,32 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX10_W32-LABEL: v_div_fmas_f64:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W32-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX10_W32-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX10_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_W64-LABEL: v_div_fmas_f64:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX10_W64-NEXT:    s_and_b32 s4, 1, s0
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX10_W64-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX10_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11_W32-LABEL: v_div_fmas_f64:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W32-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11_W32-NEXT:    s_and_b32 s0, 1, s0
+; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX11_W32-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX11_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11_W64-LABEL: v_div_fmas_f64:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W64-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX11_W64-NEXT:    s_and_b32 s0, 1, s0
+; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX11_W64-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX11_W64-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index da9601a8998c2b..39b4a0992c9b22 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -168,9 +168,9 @@ define void @localize_internal_globals(i1 %cond) {
 ; GFX9-LABEL: localize_internal_globals:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:    s_and_b32 s4, 1, s0
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB2_3
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index 5c40a4ce13e31a..9beec51710598e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -10,11 +10,10 @@ define i32 @select_sdiv_lhs_const_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_lhs_const_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x1e848
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x30d40
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_mov_b32 s6, 0x30d40
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x1e848
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 5, i32 8
   %op = sdiv i32 1000000, %select
@@ -29,11 +28,10 @@ define i32 @select_sdiv_rhs_const_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_rhs_const_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x2710
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x3e8
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_movk_i32 s6, 0x3e8
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x2710
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 42000, i32 420000
   %op = sdiv i32 %select, 42
@@ -48,11 +46,10 @@ define <2 x i32> @select_sdiv_lhs_const_v2i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_lhs_const_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x22b
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x29a
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_movk_i32 s6, 0x29a
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x22b
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x594
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, <2 x i32> <i32 5, i32 undef>, <2 x i32> <i32 6, i32 7>
@@ -68,14 +65,13 @@ define <2 x i32> @select_sdiv_rhs_const_v2i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_rhs_const_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x3661c
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x307dd
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x23b02a
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x13e3a0c
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_mov_b32 s6, 0x307dd
+; GCN-NEXT:    s_mov_b32 s5, 0x13e3a0c
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x3661c
+; GCN-NEXT:    s_cselect_b32 s5, s5, 0x23b02a
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, <2 x i32> <i32 8342123, i32 834212353>, <2 x i32> <i32 9355456, i32 93554321>
   %op = sdiv <2 x i32> %select, <i32 42, i32 40>
@@ -126,40 +122,41 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_lhs_opaque_const0_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, gv at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, gv at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, gv at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 5, v1, vcc
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_cselect_b32 s4, s6, 5
+; GCN-NEXT:    s_ashr_i32 s5, s4, 31
+; GCN-NEXT:    s_add_i32 s4, s4, s5
+; GCN-NEXT:    s_xor_b32 s4, s4, s5
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GCN-NEXT:    s_sub_i32 s6, 0, s4
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_lo_u32 v1, s6, v0
+; GCN-NEXT:    s_mov_b32 s6, 0xf4240
+; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v0
-; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0, v0
-; GCN-NEXT:    s_mov_b32 s4, 0xf4240
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_lo_u32 v3, v3, v2
-; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
-; GCN-NEXT:    v_mul_hi_u32 v2, v2, s4
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v0
-; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0xf4240, v3
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT:    v_sub_u32_e64 v4, s[4:5], v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s6
+; GCN-NEXT:    v_readfirstlane_b32 s6, v0
+; GCN-NEXT:    s_mul_i32 s6, s6, s4
+; GCN-NEXT:    s_sub_i32 s6, 0xf4240, s6
+; GCN-NEXT:    s_sub_i32 s7, s6, s4
+; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT:    s_cmp_ge_u32 s6, s4
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    s_cselect_b32 s6, s7, s6
+; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT:    s_cmp_ge_u32 s6, s4
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, s5, v0
+; GCN-NEXT:    v_subrev_u32_e32 v0, vcc, s5, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 ptrtoint (ptr addrspace(1) @gv to i32), i32 5
   %op = sdiv i32 1000000, %select
@@ -208,40 +205,41 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_lhs_opaque_const1_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, gv at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, gv at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, gv at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 5, vcc
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_cselect_b32 s4, 5, s6
+; GCN-NEXT:    s_ashr_i32 s5, s4, 31
+; GCN-NEXT:    s_add_i32 s4, s4, s5
+; GCN-NEXT:    s_xor_b32 s4, s4, s5
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GCN-NEXT:    s_sub_i32 s6, 0, s4
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_lo_u32 v1, s6, v0
+; GCN-NEXT:    s_mov_b32 s6, 0xf4240
+; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v0
-; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0, v0
-; GCN-NEXT:    s_mov_b32 s4, 0xf4240
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_lo_u32 v3, v3, v2
-; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
-; GCN-NEXT:    v_mul_hi_u32 v2, v2, s4
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v0
-; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0xf4240, v3
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT:    v_sub_u32_e64 v4, s[4:5], v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s6
+; GCN-NEXT:    v_readfirstlane_b32 s6, v0
+; GCN-NEXT:    s_mul_i32 s6, s6, s4
+; GCN-NEXT:    s_sub_i32 s6, 0xf4240, s6
+; GCN-NEXT:    s_sub_i32 s7, s6, s4
+; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT:    s_cmp_ge_u32 s6, s4
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    s_cselect_b32 s6, s7, s6
+; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT:    s_cmp_ge_u32 s6, s4
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, s5, v0
+; GCN-NEXT:    v_subrev_u32_e32 v0, vcc, s5, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 5, i32 ptrtoint (ptr addrspace(1) @gv to i32)
   %op = sdiv i32 1000000, %select
@@ -257,18 +255,15 @@ define i32 @select_sdiv_rhs_opaque_const0_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_rhs_opaque_const0_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, gv at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x392fa
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, gv at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, gv at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x30c30c31
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT:    s_mov_b32 s4, 0x30c30c31
-; GCN-NEXT:    v_mul_hi_i32 v0, v0, s4
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x392fa
+; GCN-NEXT:    v_mul_hi_i32 v0, s4, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 3, v0
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -287,18 +282,15 @@ define i32 @select_sdiv_rhs_opaque_const1_i32(i1 %cond) {
 ; GCN-LABEL: select_sdiv_rhs_opaque_const1_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, gv at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0xa410
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, gv at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, gv at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x30c30c31
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT:    s_mov_b32 s4, 0x30c30c31
-; GCN-NEXT:    v_mul_hi_i32 v0, v0, s4
+; GCN-NEXT:    s_cselect_b32 s4, 0xa410, s6
+; GCN-NEXT:    v_mul_hi_i32 v0, s4, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 3, v0
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -316,11 +308,10 @@ define i32 @select_add_lhs_const_i32(i1 %cond) {
 ; GCN-LABEL: select_add_lhs_const_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0xf4248
-; GCN-NEXT:    v_mov_b32_e32 v2, 0xf4245
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_mov_b32 s6, 0xf4245
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0xf4248
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 5, i32 8
   %op = add i32 1000000, %select
@@ -335,11 +326,9 @@ define float @select_fadd_lhs_const_i32_fmf(i1 %cond) {
 ; GCN-LABEL: select_fadd_lhs_const_i32_fmf:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x40a00000
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x40400000
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x40a00000
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, float 2.0, float 4.0
   %op = fadd nnan nsz float 1.0, %select
@@ -351,12 +340,11 @@ define i32 @select_mul_lhs_const_i32(i1 %cond) {
 ; GCN-LABEL: select_mul_lhs_const_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x1f40
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x1388
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_movk_i32 s6, 0x1388
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x1f40
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ; IR-LABEL: @select_mul_lhs_const_i32(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
 ; IR-NEXT:    ret i32 [[OP]]
@@ -370,12 +357,11 @@ define i32 @select_mul_rhs_const_i32(i1 %cond) {
 ; GCN-LABEL: select_mul_rhs_const_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x1f40
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x1388
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_movk_i32 s6, 0x1388
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x1f40
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ; IR-LABEL: @select_mul_rhs_const_i32(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
 ; IR-NEXT:    ret i32 [[OP]]
@@ -411,9 +396,7 @@ define i16 @select_add_trunc_select(i1 %cond) {
 ; GCN-LABEL: select_add_trunc_select:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 47, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 47, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ; IR-LABEL: @select_add_trunc_select(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50
@@ -432,9 +415,9 @@ define i32 @select_add_sext_select(i1 %cond) {
 ; GCN-LABEL: select_add_sext_select:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 29, vcc
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, 29, 50
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i16 -13, i16 8
   %trunc = sext i16 %select to i32
@@ -450,9 +433,9 @@ define i32 @select_add_zext_select(i1 %cond) {
 ; GCN-LABEL: select_add_zext_select:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 47, vcc
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, 47, 50
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i16 5, i16 8
   %trunc = zext i16 %select to i32
@@ -468,11 +451,10 @@ define i32 @select_add_bitcast_select(i1 %cond) {
 ; GCN-LABEL: select_add_bitcast_select:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x4000002a
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x3f80002a
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_mov_b32 s6, 0x3f80002a
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0x4000002a
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, float 1.0, float 2.0
   %trunc = bitcast float %select to i32
@@ -493,10 +475,8 @@ define <2 x half> @multi_use_cast_regression(i1 %cond) {
 ; GCN-LABEL: multi_use_cast_regression:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x3c00
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[4:5]
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GCN-NEXT:    v_sub_f32_e32 v1, 1.0, v0
 ; GCN-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 01dcc26566663b..9c8259d32dae55 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -8,7 +8,7 @@ define void @void_func_i1(i1 %arg0) #0 {
 ; CIGFX89-LABEL: void_func_i1:
 ; CIGFX89:       ; %bb.0:
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    v_and_b32_e32 v0, 1, v0
+; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
@@ -18,7 +18,7 @@ define void @void_func_i1(i1 %arg0) #0 {
 ; GFX11-LABEL: void_func_i1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
@@ -31,6 +31,7 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
 ; CIGFX89-LABEL: void_func_i1_zeroext:
 ; CIGFX89:       ; %bb.0:
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; CIGFX89-NEXT:    v_or_b32_e32 v0, 12, v0
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
@@ -41,9 +42,11 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
 ; GFX11-LABEL: void_func_i1_zeroext:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_or_b32_e32 v0, 12, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, 12, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = zext i1 %arg0 to i32
@@ -56,7 +59,8 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
 ; CI-LABEL: void_func_i1_signext:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_add_i32_e32 v0, vcc, 12, v0
+; CI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CI-NEXT:    v_sub_i32_e32 v0, vcc, 12, v0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -66,7 +70,8 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
 ; VI-LABEL: void_func_i1_signext:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 12, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; VI-NEXT:    v_sub_u32_e32 v0, vcc, 12, v0
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -76,7 +81,8 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
 ; GFX9-LABEL: void_func_i1_signext:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, 12, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:    v_sub_u32_e32 v0, 12, v0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -86,9 +92,11 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
 ; GFX11-LABEL: void_func_i1_signext:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, 12, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 12, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = sext i1 %arg0 to i32
@@ -101,9 +109,7 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
 ; CIGFX89-LABEL: i1_arg_i1_use:
 ; CIGFX89:       ; %bb.0: ; %bb
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    v_and_b32_e32 v0, 1, v0
-; CIGFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; CIGFX89-NEXT:    s_xor_b64 s[6:7], vcc, -1
+; CIGFX89-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
 ; CIGFX89-NEXT:    s_and_saveexec_b64 s[4:5], s[6:7]
 ; CIGFX89-NEXT:    s_cbranch_execz .LBB3_2
 ; CIGFX89-NEXT:  ; %bb.1: ; %bb1
@@ -120,11 +126,9 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
 ; GFX11-LABEL: i1_arg_i1_use:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    s_xor_b32 s1, vcc_lo, -1
+; GFX11-NEXT:    s_xor_b32 s1, s0, -1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s1
 ; GFX11-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX11-NEXT:  ; %bb.1: ; %bb1
@@ -2775,12 +2779,11 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:12
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_ubyte v16, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
 ; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2789,15 +2792,15 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v19, v20
-; CI-NEXT:    v_and_b32_e32 v0, 1, v16
+; CI-NEXT:    v_cvt_f16_f32_e32 v18, v20
+; CI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; CI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v17, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v16, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v18, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v17, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v19, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v18, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2814,12 +2817,12 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:4
+; VI-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2828,14 +2831,13 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v0, 1, v20
-; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v18, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v20, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; VI-NEXT:    buffer_store_short v16, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v18, off, s[4:7], 0
+; VI-NEXT:    buffer_store_short v17, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2852,14 +2854,12 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2868,29 +2868,28 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v20
-; GFX9-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v18, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v20, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_short v16, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v18, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_short v17, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_i1_i8_i16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x4
+; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    scratch_load_u8 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_u16 v33, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v34, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v35, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u16 v33, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_u16 v34, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v35, off, s32 offset:12
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v32, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
@@ -2899,8 +2898,6 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_and_b32_e32 v16, 1, v32
 ; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
@@ -2909,7 +2906,7 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v16, off, s[0:3], 0 dlc
+; GFX11-NEXT:    buffer_store_b8 v32, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
@@ -4566,4 +4563,276 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
   ret void
 }
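+; Same wave64 convention as the GlobalISel many_i1_args test, summarized
+; from the GFX9 checks below (illustrative note only): the leading i1 args
+; come in as 64-bit masks in SGPR pairs, here s[4:5] through s[28:29], and
+; each later i1 arrives as the low bit of a 32-bit VGPR, which is why the
+; VGPR copies are masked with a v_and_b32 of 1 before being stored.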
 
+define void @many_i1_args(
+  i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
+  i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
+  i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
+  i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
+; GFX9-LABEL: many_i1_args:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_xor_saveexec_b64 vcc, -1
+; GFX9-NEXT:    buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, vcc
+; GFX9-NEXT:    v_writelane_b32 v19, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v19, s31, 1
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GFX9-NEXT:    s_mov_b32 s31, 0xf000
+; GFX9-NEXT:    s_mov_b32 s30, -1
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[6:7]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[8:9]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[10:11]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[12:13]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[14:15]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[16:17]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[18:19]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[20:21]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[22:23]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[24:25]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[26:27]
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[28:29]
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v1
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v3
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v4
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v5
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v6
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v7
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v8
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v9
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v10
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v11
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v12
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v13
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v14
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v15
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v16
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v17
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v18
+; GFX9-NEXT:    buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v19, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v19, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: many_i1_args:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s32          ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, vcc_lo
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s1
+; GFX11-NEXT:    s_mov_b32 s30, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_mov_b32 s31, 0x31016000
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s6
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s11
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s13
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s15
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s16
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s17
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s19
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s20
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s21
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s22
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s23
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s24
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s25
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s26
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s27
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s28
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s29
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v1, off, s[28:31], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32           ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  store volatile i1 %arg2, ptr addrspace(1) undef
+  store volatile i1 %arg3, ptr addrspace(1) undef
+  store volatile i1 %arg4, ptr addrspace(1) undef
+  store volatile i1 %arg5, ptr addrspace(1) undef
+  store volatile i1 %arg6, ptr addrspace(1) undef
+  store volatile i1 %arg7, ptr addrspace(1) undef
+
+  store volatile i1 %arg8, ptr addrspace(1) undef
+  store volatile i1 %arg9, ptr addrspace(1) undef
+  store volatile i1 %arg10, ptr addrspace(1) undef
+  store volatile i1 %arg11, ptr addrspace(1) undef
+  store volatile i1 %arg12, ptr addrspace(1) undef
+  store volatile i1 %arg13, ptr addrspace(1) undef
+  store volatile i1 %arg14, ptr addrspace(1) undef
+  store volatile i1 %arg15, ptr addrspace(1) undef
+
+  store volatile i1 %arg16, ptr addrspace(1) undef
+  store volatile i1 %arg17, ptr addrspace(1) undef
+  store volatile i1 %arg18, ptr addrspace(1) undef
+  store volatile i1 %arg19, ptr addrspace(1) undef
+  store volatile i1 %arg20, ptr addrspace(1) undef
+  store volatile i1 %arg21, ptr addrspace(1) undef
+  store volatile i1 %arg22, ptr addrspace(1) undef
+  store volatile i1 %arg23, ptr addrspace(1) undef
+
+  store volatile i1 %arg24, ptr addrspace(1) undef
+  store volatile i1 %arg25, ptr addrspace(1) undef
+  store volatile i1 %arg26, ptr addrspace(1) undef
+  store volatile i1 %arg27, ptr addrspace(1) undef
+  store volatile i1 %arg28, ptr addrspace(1) undef
+  store volatile i1 %arg29, ptr addrspace(1) undef
+  store volatile i1 %arg30, ptr addrspace(1) undef
+  store volatile i1 %arg31, ptr addrspace(1) undef
+
+  ret void
+}
+
+
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index e7d86c0c178e97..f725f26ccca146 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -12,6 +12,8 @@ define i1 @i1_func_void() #0 {
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
 ; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX789-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i1_func_void:
@@ -21,6 +23,9 @@ define i1 @i1_func_void() #0 {
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
diff --git a/llvm/test/CodeGen/AMDGPU/z_callee.ll b/llvm/test/CodeGen/AMDGPU/z_callee.ll
deleted file mode 100644
index 44af2c90f900b3..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_callee.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-define void @void_func_i1(i1 %arg0) #0 {
-; For CIGFX89, the i1 arg is passed in s4, but the v_cndmask insn uses s[4:5].
-; Therefore, the "s_mov_b32 s5, 0" is generated.
-;
-; CIGFX89-LABEL: void_func_i1:
-; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
-; CIGFX89-NEXT:    s_mov_b32 s6, -1
-; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: void_func_i1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store i1 %arg0, ptr addrspace(1) undef
-  ret void
-}
-
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller.ll b/llvm/test/CodeGen/AMDGPU/z_caller.ll
deleted file mode 100644
index f9203cf078e47c..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_caller.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-
-declare hidden void @external_void_func_i1(i1) #0
-
-define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
-; GFX9-LABEL: test_call_external_void_func_i1_imm:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s38, -1
-; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    s_mov_b64 s[4:5], -1
-; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_getpc_b64 s[8:9]
-; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_i1@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX11-LABEL: test_call_external_void_func_i1_imm:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    s_mov_b32 s0, -1
-; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1@rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1@rel32@hi+12
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT:    s_endpgm
-  call void @external_void_func_i1(i1 true)
-  ret void
-}
-
-attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller2.ll b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
deleted file mode 100644
index 1141476960250a..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_caller2.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-
-declare hidden void @external_void_func_i1_signext(i1 signext) #0
-
-define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
-; GFX9-LABEL: test_call_external_void_func_i1_signext:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s38, -1
-; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s5
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_getpc_b64 s[8:9]
-; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_i1_signext@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_i1_signext@rel32@hi+12
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX11-LABEL: test_call_external_void_func_i1_signext:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[4:5]
-; GFX11-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 1, v0
-; GFX11-NEXT:    s_mov_b32 s0, s2
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX11-NEXT:    s_endpgm
-  %var = load volatile i1, ptr addrspace(1) undef
-  call void @external_void_func_i1_signext(i1 signext %var)
-  ret void
-}
-
-
-
-attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/z_return.ll b/llvm/test/CodeGen/AMDGPU/z_return.ll
deleted file mode 100644
index 6bf64da7a1b8ff..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_return.ll
+++ /dev/null
@@ -1,80 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-define i1 @i1_func_void() #0 {
-  %val = load i1, ptr addrspace(1) undef
-  ret i1 %val
-}
-
-define void @test_call_i1_func_void() #0 {
-; CIGFX89-LABEL: test_call_i1_func_void:
-; CIGFX89: ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    s_mov_b32 s6, s33
-; CIGFX89-NEXT:    s_mov_b32 s33, s32
-; CIGFX89-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CIGFX89-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
-; CIGFX89-NEXT:    s_mov_b64 exec, s[4:5]
-; CIGFX89-NEXT:    s_addk_i32 s32, 0x400
-; CIGFX89-NEXT:    s_getpc_b64 s[4:5]
-; CIGFX89-NEXT:    s_add_u32 s4, s4, i1_func_void@gotpcrel32@lo+4
-; CIGFX89-NEXT:    s_addc_u32 s5, s5, i1_func_void@gotpcrel32@hi+12
-; CIGFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; CIGFX89-NEXT:    v_writelane_b32 v1, s30, 0
-; CIGFX89-NEXT:    v_writelane_b32 v1, s31, 1
-; CIGFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; CIGFX89-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CIGFX89-NEXT:    global_store_byte v[2:3], v0, off
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    v_readlane_b32 s31, v1, 1
-; CIGFX89-NEXT:    v_readlane_b32 s30, v1, 0
-; CIGFX89-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CIGFX89-NEXT:    buffer_load_dword v1, off, s[0:3], s33  ; 4-byte Folded Reload
-; CIGFX89-NEXT:    s_mov_b64 exec, s[4:5]
-; CIGFX89-NEXT:    s_addk_i32 s32, 0xfc00
-; CIGFX89-NEXT:    s_mov_b32 s33, s6
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: test_call_i1_func_void:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s2, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v1, s33          ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, i1_func_void@gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, i1_func_void@gotpcrel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v1, s30, 0
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_writelane_b32 v1, s31, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, s0, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v1, 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    global_store_b8 v[2:3], v0, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v1, off, s33           ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, -16
-; GFX11-NEXT:    s_mov_b32 s33, s2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-
-  %val = call i1 @i1_func_void()
-  store volatile i1 %val, ptr addrspace(1) undef
-  ret void
-}
-
-attributes #0 = { nounwind }
-
-