[llvm] [AMDGPU] Allocate i1 argument to SGPRs (PR #72461)
Jun Wang via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 21 15:48:08 PST 2023
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/72461
>From bf53dc281335730b26aed7eae37d0717ea94f9fa Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Wed, 15 Nov 2023 19:48:41 -0600
Subject: [PATCH 1/5] [AMDGPU] Allocate i1 argument to SGPRs
Currently, i1 arguments are passed as 32-bit VGPRs. It makes more sense to
use SGPRs and pass these values as a wave-size bool mask (one bit per lane).
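
As a conceptual illustration (plain C++, not LLVM code; the helper name is
made up), the wave-size mask packs one i1 per lane into a single scalar
value, which lands in a 64-bit SGPR pair on wave64 targets and in a single
32-bit SGPR on wave32:

    #include <cstdint>

    // Pack per-lane boolean values into a wave64 lane mask: bit i of the
    // result holds the i1 value of lane i.
    static uint64_t packLaneMask(const bool lanes[64]) {
      uint64_t Mask = 0;
      for (int i = 0; i < 64; ++i)
        Mask |= uint64_t(lanes[i]) << i;
      return Mask;
    }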
---
llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 5 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 13 +++++
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 23 +++++++++
llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 6 +++
llvm/test/CodeGen/AMDGPU/z_callee.ll | 33 ++++++++++++
llvm/test/CodeGen/AMDGPU/z_caller.ll | 43 ++++++++++++++++
llvm/test/CodeGen/AMDGPU/z_caller2.ll | 57 +++++++++++++++++++++
7 files changed, 179 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/z_callee.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/z_caller.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/z_caller2.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 9036b26a6f6bcb..3f18cbd661d5b2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -185,9 +185,12 @@ def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
// Calling convention for leaf functions
def CC_AMDGPU_Func : CallingConv<[
CCIfByVal<CCPassByVal<4, 4>>,
- CCIfType<[i1], CCPromoteToType<i32>>,
CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
+ CCIfType<[i1] , CCAssignToReg<
+ !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29
+ >>,
+
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
!foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29
>>>,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 708f212e204acf..04d1dde9bf1bc6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3479,6 +3479,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
}
+ // The code below (after the call to AnalyzeCallOperands) uses either
+ // s[48:51] or s[0:3] when !Subtarget->enableFlatScratch(). Therefore,
+ // before calling AnalyzeCallOperands, we may need to reserve these
+ // registers so they are not allocated to arguments.
+ if (!Subtarget->enableFlatScratch()) {
+ if (IsChainCallConv)
+ CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
+ AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50, AMDGPU::SGPR51}, 4);
+ else
+ CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
+ AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3}, 4);
+ }
+
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
// Get a count of how many bytes are to be pushed on the stack.
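
For reference, a minimal sketch of the reservation idea using CCState's
public API (the CCInfo and register names are assumed to match the
surrounding LowerCall code):

    // AllocateRegBlock marks the requested number of consecutive
    // registers as used, so a later State.AllocateReg(...) inside the
    // calling-convention function skips them instead of assigning them
    // to i1 arguments.
    static const MCPhysReg ScratchRsrc[] = {AMDGPU::SGPR0, AMDGPU::SGPR1,
                                            AMDGPU::SGPR2, AMDGPU::SGPR3};
    CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>(ScratchRsrc), 4);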
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d4746b559d9256..837f36ebe6d847 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -854,6 +854,16 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
+ // When the calling convention allocates an SGPR for an i1 argument, we
+ // may have an SGPR_64 to SReg_32 copy for an outgoing i1 argument.
+ // Adjust the copy to avoid an illegal copy.
+ if (AMDGPU::SGPR_64RegClass.contains(SrcReg)) {
+ auto sub0 = RI.getSubReg(SrcReg, AMDGPU::sub0);
+ if (sub0 != DestReg)
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg).addReg(sub0);
+ return;
+ }
+
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
@@ -887,6 +897,19 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+ // When an i1 argument is allocated to an SGPR_32, we may have a COPY
+ // from SGPR_32 to SReg_64. The following handles this case to avoid
+ // an illegal copy.
+ if(AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
+ auto sub0 = RI.getSubReg(DestReg, AMDGPU::sub0);
+ if (sub0 != SrcReg) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), sub0).addReg(SrcReg);
+ }
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32),
+ RI.getSubReg(DestReg, AMDGPU::sub1)).addImm(0);
+ return;
+ }
+
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
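
A condensed sketch of what the adjustment above produces for an
SGPR_32-to-SReg_64 copy (assumed wave64; MIR-style pseudocode in
comments):

    // Before:  $sgpr0_sgpr1 = COPY $sgpr4    (illegal 32-to-64 copy)
    // After:   $sgpr0 = S_MOV_B32 $sgpr4     (value into the low half)
    //          $sgpr1 = S_MOV_B32 0          (zero the high half so all
    //                                         64 lane-mask bits are defined)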
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 68c8f4024e7300..189333b063e522 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -513,6 +513,12 @@ bool SILowerI1Copies::lowerCopiesFromI1() {
if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
continue;
+ // When the calling convention allocates an i1 argument to an SGPR,
+ // we may have a COPY with dst being an SGPR_32. This should
+ // not be lowered into V_CNDMASK_B32.
+ if(AMDGPU::SGPR_32RegClass.contains(DstReg))
+ continue;
+
Changed = true;
// Copy into a 32-bit vector register.
diff --git a/llvm/test/CodeGen/AMDGPU/z_callee.ll b/llvm/test/CodeGen/AMDGPU/z_callee.ll
new file mode 100644
index 00000000000000..2fc4befa279f3e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_callee.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+define void @void_func_i1(i1 %arg0) #0 {
+; For CIGFX89, the i1 arg is passed in s4, but the v_cndmask insn uses s[4:5].
+; Therefore, the "s_mov_b32 s5, 0" is generated.
+;
+; CIGFX89-LABEL: void_func_i1:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: s_mov_b32 s5, 0
+; CIGFX89-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store i1 %arg0, ptr addrspace(1) undef
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller.ll b/llvm/test/CodeGen/AMDGPU/z_caller.ll
new file mode 100644
index 00000000000000..faf25e407fca2c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_caller.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+
+declare hidden void @external_void_func_i1(i1) #0
+
+define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_imm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s3
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s4, -1
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: s_getpc_b64 s[8:9]
+; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_i1@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i1_imm:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s0, -1
+; GFX11-NEXT: s_mov_b32 s32, 0
+; GFX11-NEXT: s_getpc_b64 s[2:3]
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1@rel32@lo+4
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1@rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT: s_endpgm
+ call void @external_void_func_i1(i1 true)
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller2.ll b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
new file mode 100644
index 00000000000000..e63ae50b7e91cd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+
+declare hidden void @external_void_func_i1_signext(i1 signext) #0
+
+define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_signext:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s5
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: s_getpc_b64 s[8:9]
+; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_i1_signext@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_i1_signext@rel32@hi+12
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i1_signext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s32, 0
+; GFX11-NEXT: s_getpc_b64 s[4:5]
+; GFX11-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
+; GFX11-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 1, v0
+; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX11-NEXT: s_endpgm
+ %var = load volatile i1, ptr addrspace(1) undef
+ call void @external_void_func_i1_signext(i1 signext %var)
+ ret void
+}
+
+
+
+attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
>From 4af46db5a0731101d91da545a967f0f233c360fc Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Wed, 15 Nov 2023 20:37:27 -0600
Subject: [PATCH 2/5] Fix format.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 11 +++++++----
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 13 +++++++------
llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 2 +-
3 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 04d1dde9bf1bc6..61e2855113ba23 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3485,11 +3485,14 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// registers so they are not allocated to arguments.
if (!Subtarget->enableFlatScratch()) {
if (IsChainCallConv)
- CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
- AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50, AMDGPU::SGPR51}, 4);
+ CCInfo.AllocateRegBlock(
+ ArrayRef<MCPhysReg>{AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50,
+ AMDGPU::SGPR51},
+ 4);
else
- CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
- AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3}, 4);
+ CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1,
+ AMDGPU::SGPR2, AMDGPU::SGPR3},
+ 4);
}
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 837f36ebe6d847..1ccca2adac9433 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -869,7 +869,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ .addReg(SrcReg, getKillRegState(KillSrc));
return;
}
@@ -884,13 +884,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (DestReg == AMDGPU::VCC) {
if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ .addReg(SrcReg, getKillRegState(KillSrc));
} else {
// FIXME: Hack until VReg_1 removed.
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
- .addImm(0)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(KillSrc));
}
return;
@@ -900,13 +900,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// When an i1 argument is allocated to an SGPR_32, we may have a COPY
// from SGPR_32 to SReg_64. The following handles this case to avoid
// an illegal copy.
- if(AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
+ if (AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
auto sub0 = RI.getSubReg(DestReg, AMDGPU::sub0);
if (sub0 != SrcReg) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), sub0).addReg(SrcReg);
}
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32),
- RI.getSubReg(DestReg, AMDGPU::sub1)).addImm(0);
+ RI.getSubReg(DestReg, AMDGPU::sub1))
+ .addImm(0);
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 189333b063e522..161ea3ddc0435c 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -516,7 +516,7 @@ bool SILowerI1Copies::lowerCopiesFromI1() {
// When the calling convention allocates an i1 argument to an SGPR,
// we may have a COPY with dst being an SGPR_32. This should
// not be lowered into V_CNDMASK_B32.
- if(AMDGPU::SGPR_32RegClass.contains(DstReg))
+ if (AMDGPU::SGPR_32RegClass.contains(DstReg))
continue;
Changed = true;
>From 072ba02588343f49d6e51f00171d410c1a5a131a Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 30 Nov 2023 12:31:17 -0600
Subject: [PATCH 3/5] Creating a custom calling conv function for i1.
---
llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 9 +--
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 31 +++++++
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 9 ++-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 24 ------
llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 13 +--
llvm/test/CodeGen/AMDGPU/z_callee.ll | 7 +-
llvm/test/CodeGen/AMDGPU/z_caller.ll | 6 +-
llvm/test/CodeGen/AMDGPU/z_caller2.ll | 4 +-
llvm/test/CodeGen/AMDGPU/z_return.ll | 80 +++++++++++++++++++
9 files changed, 137 insertions(+), 46 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/z_return.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 3f18cbd661d5b2..954145f092ed7a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -187,9 +187,7 @@ def CC_AMDGPU_Func : CallingConv<[
CCIfByVal<CCPassByVal<4, 4>>,
CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
- CCIfType<[i1] , CCAssignToReg<
- !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29
- >>,
+ CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
!foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29
@@ -205,8 +203,9 @@ def CC_AMDGPU_Func : CallingConv<[
// Calling convention for leaf functions
def RetCC_AMDGPU_Func : CallingConv<[
- CCIfType<[i1], CCPromoteToType<i32>>,
- CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
+ CCIfType<[i16], CCIfExtend<CCPromoteToType<i32>>>,
+ CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
+
CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 9d7443012e3da3..5ed7bd5417e3a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -29,6 +29,37 @@
using namespace llvm;
+static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT,
+ MVT LocVT, CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ static bool IsWave64 = static_cast<const GCNSubtarget&>(State.getMachineFunction().getSubtarget()).isWave64();
+
+ static const MCPhysReg I1RegList1[] = {
+ AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR2_SGPR3, AMDGPU::SGPR4_SGPR5,
+ AMDGPU::SGPR6_SGPR7, AMDGPU::SGPR8_SGPR9, AMDGPU::SGPR10_SGPR11,
+ AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
+ AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
+ AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29
+ };
+
+ static const MCPhysReg I1RegList2[] = {
+ AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3, AMDGPU::SGPR4,
+ AMDGPU::SGPR5, AMDGPU::SGPR6, AMDGPU::SGPR7, AMDGPU::SGPR8, AMDGPU::SGPR9,
+ AMDGPU::SGPR10, AMDGPU::SGPR11, AMDGPU::SGPR12, AMDGPU::SGPR13,
+ AMDGPU::SGPR14, AMDGPU::SGPR15, AMDGPU::SGPR16, AMDGPU::SGPR17,
+ AMDGPU::SGPR18, AMDGPU::SGPR19, AMDGPU::SGPR20, AMDGPU::SGPR21,
+ AMDGPU::SGPR22, AMDGPU::SGPR23, AMDGPU::SGPR24, AMDGPU::SGPR25,
+ AMDGPU::SGPR26, AMDGPU::SGPR27, AMDGPU::SGPR28, AMDGPU::SGPR29
+ };
+
+ assert (LocVT == MVT::i1);
+ if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1) : State.AllocateReg(I1RegList2)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+ return false; // not allocated
+}
+
#include "AMDGPUGenCallingConv.inc"
static cl::opt<bool> AMDGPUBypassSlowDiv(
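
For context, the CCCustom<"CC_AMDGPU_Custom_I1"> entry in
AMDGPUCallingConv.td makes the TableGen-generated CC_AMDGPU_Func call this
handler. A simplified sketch of the shape of the generated code (the real
contents of AMDGPUGenCallingConv.inc will differ):

    // Inside the generated CC_AMDGPU_Func(...):
    if (LocVT == MVT::i1) {
      // The custom handler returns true once it has assigned a location.
      if (CC_AMDGPU_Custom_I1(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
        return false; // By CCAssignFn convention, false means success.
    }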
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 61e2855113ba23..6eda46120a8172 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2831,8 +2831,13 @@ SDValue SITargetLowering::LowerFormalArguments(
RC = &AMDGPU::VGPR_32RegClass;
else if (AMDGPU::SGPR_32RegClass.contains(Reg))
RC = &AMDGPU::SGPR_32RegClass;
- else
- llvm_unreachable("Unexpected register class in LowerFormalArguments!");
+ else {
+ if (VT == MVT::i1 && Subtarget->isWave64())
+ RC = &AMDGPU::SGPR_64RegClass;
+ else
+ llvm_unreachable("Unexpected register class in LowerFormalArguments!");
+ }
+
EVT ValVT = VA.getValVT();
Reg = MF.addLiveIn(Reg, RC);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 1ccca2adac9433..b28e88b3711e84 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -854,16 +854,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
- // When the calling convention allocates an SGPR for an i1 argument, we
- // may have an SGPR_64 to SReg_32 copy for an outgoing i1 argument.
- // Adjust the copy to avoid an illegal copy.
- if (AMDGPU::SGPR_64RegClass.contains(SrcReg)) {
- auto sub0 = RI.getSubReg(SrcReg, AMDGPU::sub0);
- if (sub0 != DestReg)
- BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg).addReg(sub0);
- return;
- }
-
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
@@ -897,20 +887,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
- // When an i1 argument is allocated to an SGPR_32, we may have a COPY
- // from SGPR_32 to SReg_64. The following handles this case to avoid
- // an illegal copy.
- if (AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
- auto sub0 = RI.getSubReg(DestReg, AMDGPU::sub0);
- if (sub0 != SrcReg) {
- BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), sub0).addReg(SrcReg);
- }
- BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32),
- RI.getSubReg(DestReg, AMDGPU::sub1))
- .addImm(0);
- return;
- }
-
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 161ea3ddc0435c..546176c69dd7a4 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -513,12 +513,6 @@ bool SILowerI1Copies::lowerCopiesFromI1() {
if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
continue;
- // When the calling convention allocates an i1 argument to an SGPR,
- // we may have a COPY with dst being an SGPR_32. This should
- // not be lowered into V_CNDMASK_B32.
- if (AMDGPU::SGPR_32RegClass.contains(DstReg))
- continue;
-
Changed = true;
// Copy into a 32-bit vector register.
@@ -721,6 +715,13 @@ bool SILowerI1Copies::lowerCopiesToI1() {
assert(!MI.getOperand(1).getSubReg());
if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
+ if (!SrcReg.isVirtual() && TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
+ // When calling convention allocates SGPR for i1, for GPUs with wavefront size 64, i1
+ // return value is put in 64b SGPR.
+ assert(ST->isWave64());
+ continue;
+ }
+
assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
unsigned TmpReg = createLaneMaskReg(*MF);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
diff --git a/llvm/test/CodeGen/AMDGPU/z_callee.ll b/llvm/test/CodeGen/AMDGPU/z_callee.ll
index 2fc4befa279f3e..44af2c90f900b3 100644
--- a/llvm/test/CodeGen/AMDGPU/z_callee.ll
+++ b/llvm/test/CodeGen/AMDGPU/z_callee.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
define void @void_func_i1(i1 %arg0) #0 {
; For CIGFX89, the i1 arg is passed in s4, but the v_cndmask insn uses s[4:5].
@@ -11,7 +11,6 @@ define void @void_func_i1(i1 %arg0) #0 {
; CIGFX89-LABEL: void_func_i1:
; CIGFX89: ; %bb.0:
; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT: s_mov_b32 s5, 0
; CIGFX89-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
; CIGFX89-NEXT: s_mov_b32 s6, -1
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller.ll b/llvm/test/CodeGen/AMDGPU/z_caller.ll
index faf25e407fca2c..f9203cf078e47c 100644
--- a/llvm/test/CodeGen/AMDGPU/z_caller.ll
+++ b/llvm/test/CodeGen/AMDGPU/z_caller.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
declare hidden void @external_void_func_i1(i1) #0
@@ -17,7 +17,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s4, -1
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[8:9]
; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller2.ll b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
index e63ae50b7e91cd..1141476960250a 100644
--- a/llvm/test/CodeGen/AMDGPU/z_caller2.ll
+++ b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
declare hidden void @external_void_func_i1_signext(i1 signext) #0
diff --git a/llvm/test/CodeGen/AMDGPU/z_return.ll b/llvm/test/CodeGen/AMDGPU/z_return.ll
new file mode 100644
index 00000000000000..6bf64da7a1b8ff
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_return.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+define i1 @i1_func_void() #0 {
+ %val = load i1, ptr addrspace(1) undef
+ ret i1 %val
+}
+
+define void @test_call_i1_func_void() #0 {
+; CIGFX89-LABEL: test_call_i1_func_void:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: s_mov_b32 s6, s33
+; CIGFX89-NEXT: s_mov_b32 s33, s32
+; CIGFX89-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CIGFX89-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; CIGFX89-NEXT: s_mov_b64 exec, s[4:5]
+; CIGFX89-NEXT: s_addk_i32 s32, 0x400
+; CIGFX89-NEXT: s_getpc_b64 s[4:5]
+; CIGFX89-NEXT: s_add_u32 s4, s4, i1_func_void@gotpcrel32@lo+4
+; CIGFX89-NEXT: s_addc_u32 s5, s5, i1_func_void@gotpcrel32@hi+12
+; CIGFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CIGFX89-NEXT: v_writelane_b32 v1, s30, 0
+; CIGFX89-NEXT: v_writelane_b32 v1, s31, 1
+; CIGFX89-NEXT: s_waitcnt lgkmcnt(0)
+; CIGFX89-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; CIGFX89-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CIGFX89-NEXT: global_store_byte v[2:3], v0, off
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: v_readlane_b32 s31, v1, 1
+; CIGFX89-NEXT: v_readlane_b32 s30, v1, 0
+; CIGFX89-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CIGFX89-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
+; CIGFX89-NEXT: s_mov_b64 exec, s[4:5]
+; CIGFX89-NEXT: s_addk_i32 s32, 0xfc00
+; CIGFX89-NEXT: s_mov_b32 s33, s6
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_i1_func_void:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s2, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, i1_func_void@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, i1_func_void@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v1, s30, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v1, s31, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_readlane_b32 s31, v1, 1
+; GFX11-NEXT: v_readlane_b32 s30, v1, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: global_store_b8 v[2:3], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
+; GFX11-NEXT: s_mov_b32 s33, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+
+ %val = call i1 @i1_func_void()
+ store volatile i1 %val, ptr addrspace(1) undef
+ ret void
+}
+
+attributes #0 = { nounwind }
+
+
>From 9aa2b86846453055a97e3a4349cfdcf22711467d Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 30 Nov 2023 20:04:19 -0600
Subject: [PATCH 4/5] Fix formatting.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 44 ++++++++++---------
llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 7 +--
2 files changed, 27 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 5ed7bd5417e3a4..ac8432e75e6c5a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -29,31 +29,33 @@
using namespace llvm;
-static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- static bool IsWave64 = static_cast<const GCNSubtarget&>(State.getMachineFunction().getSubtarget()).isWave64();
+static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ static bool IsWave64 = static_cast<const GCNSubtarget &>(
+ State.getMachineFunction().getSubtarget())
+ .isWave64();
static const MCPhysReg I1RegList1[] = {
- AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR2_SGPR3, AMDGPU::SGPR4_SGPR5,
- AMDGPU::SGPR6_SGPR7, AMDGPU::SGPR8_SGPR9, AMDGPU::SGPR10_SGPR11,
- AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
- AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
- AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29
- };
+ AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR2_SGPR3, AMDGPU::SGPR4_SGPR5,
+ AMDGPU::SGPR6_SGPR7, AMDGPU::SGPR8_SGPR9, AMDGPU::SGPR10_SGPR11,
+ AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
+ AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
+ AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29};
static const MCPhysReg I1RegList2[] = {
- AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3, AMDGPU::SGPR4,
- AMDGPU::SGPR5, AMDGPU::SGPR6, AMDGPU::SGPR7, AMDGPU::SGPR8, AMDGPU::SGPR9,
- AMDGPU::SGPR10, AMDGPU::SGPR11, AMDGPU::SGPR12, AMDGPU::SGPR13,
- AMDGPU::SGPR14, AMDGPU::SGPR15, AMDGPU::SGPR16, AMDGPU::SGPR17,
- AMDGPU::SGPR18, AMDGPU::SGPR19, AMDGPU::SGPR20, AMDGPU::SGPR21,
- AMDGPU::SGPR22, AMDGPU::SGPR23, AMDGPU::SGPR24, AMDGPU::SGPR25,
- AMDGPU::SGPR26, AMDGPU::SGPR27, AMDGPU::SGPR28, AMDGPU::SGPR29
- };
-
- assert (LocVT == MVT::i1);
- if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1) : State.AllocateReg(I1RegList2)) {
+ AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3,
+ AMDGPU::SGPR4, AMDGPU::SGPR5, AMDGPU::SGPR6, AMDGPU::SGPR7,
+ AMDGPU::SGPR8, AMDGPU::SGPR9, AMDGPU::SGPR10, AMDGPU::SGPR11,
+ AMDGPU::SGPR12, AMDGPU::SGPR13, AMDGPU::SGPR14, AMDGPU::SGPR15,
+ AMDGPU::SGPR16, AMDGPU::SGPR17, AMDGPU::SGPR18, AMDGPU::SGPR19,
+ AMDGPU::SGPR20, AMDGPU::SGPR21, AMDGPU::SGPR22, AMDGPU::SGPR23,
+ AMDGPU::SGPR24, AMDGPU::SGPR25, AMDGPU::SGPR26, AMDGPU::SGPR27,
+ AMDGPU::SGPR28, AMDGPU::SGPR29};
+
+ assert(LocVT == MVT::i1);
+ if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1)
+ : State.AllocateReg(I1RegList2)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 546176c69dd7a4..211250fac4eea7 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -715,9 +715,10 @@ bool SILowerI1Copies::lowerCopiesToI1() {
assert(!MI.getOperand(1).getSubReg());
if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
- if (!SrcReg.isVirtual() && TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
- // When calling convention allocates SGPR for i1, for GPUs with wavefront size 64, i1
- // return value is put in 64b SGPR.
+ if (!SrcReg.isVirtual() &&
+ TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
+ // When the calling convention allocates an SGPR for i1 on GPUs with
+ // a wavefront size of 64, the i1 return value is put in a 64-bit SGPR.
assert(ST->isWave64());
continue;
}
>From 0602c5dd1a92fe9f19ca8d6404af5a867db1e325 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 21 Dec 2023 16:13:47 -0600
Subject: [PATCH 5/5] Fixed (1) problems for GlobalISel w.r.t. both incoming
 args and return values, and (2) a problem in AMDGPUCallingConv.td when no
 SGPRs are available.
---
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 20 +-
llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 2 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 13 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
.../irtranslator-call-return-values.ll | 20 +-
.../AMDGPU/GlobalISel/irtranslator-call.ll | 18 +-
.../GlobalISel/irtranslator-function-args.ll | 246 ++++++++++--
.../GlobalISel/irtranslator-invariant.ll | 6 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll | 48 +--
.../CodeGen/AMDGPU/GlobalISel/localizer.ll | 6 +-
...amdgpu-codegenprepare-fold-binop-select.ll | 278 +++++++-------
llvm/test/CodeGen/AMDGPU/function-args.ll | 363 +++++++++++++++---
llvm/test/CodeGen/AMDGPU/function-returns.ll | 5 +
llvm/test/CodeGen/AMDGPU/z_callee.ll | 32 --
llvm/test/CodeGen/AMDGPU/z_caller.ll | 43 ---
llvm/test/CodeGen/AMDGPU/z_caller2.ll | 57 ---
llvm/test/CodeGen/AMDGPU/z_return.ll | 80 ----
17 files changed, 744 insertions(+), 495 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/z_callee.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/z_caller.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/z_caller2.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/z_return.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index cf2896f80f19bb..0ec295c63fb694 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -124,7 +124,15 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
if (VA.getLocVT().getSizeInBits() < 32) {
// 16-bit types are reported as legal for 32-bit registers. We need to do
// a 32-bit copy, and truncate to avoid the verifier complaining about it.
- auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
+ unsigned CopyToBits = 32;
+
+ // When function return type is i1, it may be in a 64b register.
+ if (VA.getLocVT().getSizeInBits() == 1) {
+ if (MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64)
+ CopyToBits = 64;
+ }
+
+ auto Copy = MIRBuilder.buildCopy(LLT::scalar(CopyToBits), PhysReg);
// If we have signext/zeroext, it applies to the whole 32-bit register
// before truncation.
@@ -233,7 +241,15 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
void assignValueToReg(Register ValVReg, Register PhysReg,
const CCValAssign &VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
- Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
+ Register ExtReg;
+
+ if (VA.getLocVT().getSizeInBits() == 1 &&
+ MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64) {
+ ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(64), ValVReg).getReg(0);
+ } else {
+ ExtReg = extendRegisterMin32(*this, ValVReg, VA);
+ }
+
MIRBuilder.buildCopy(PhysReg, ExtReg);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 954145f092ed7a..4db89fc75531d6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -189,6 +189,8 @@ def CC_AMDGPU_Func : CallingConv<[
CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
+ CCIfType<[i1], CCPromoteToType<i32>>,
+
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
!foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29
>>>,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index ac8432e75e6c5a..ed654c50582ee9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -32,18 +32,17 @@ using namespace llvm;
static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
- static bool IsWave64 = static_cast<const GCNSubtarget &>(
- State.getMachineFunction().getSubtarget())
- .isWave64();
+ bool IsWave64 =
+ State.getMachineFunction().getSubtarget<GCNSubtarget>().isWave64();
- static const MCPhysReg I1RegList1[] = {
+ static const MCPhysReg SGPRArgsWave64[] = {
AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR2_SGPR3, AMDGPU::SGPR4_SGPR5,
AMDGPU::SGPR6_SGPR7, AMDGPU::SGPR8_SGPR9, AMDGPU::SGPR10_SGPR11,
AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29};
- static const MCPhysReg I1RegList2[] = {
+ static const MCPhysReg SGPRArgsWave32[] = {
AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3,
AMDGPU::SGPR4, AMDGPU::SGPR5, AMDGPU::SGPR6, AMDGPU::SGPR7,
AMDGPU::SGPR8, AMDGPU::SGPR9, AMDGPU::SGPR10, AMDGPU::SGPR11,
@@ -54,8 +53,8 @@ static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT, MVT LocVT,
AMDGPU::SGPR28, AMDGPU::SGPR29};
assert(LocVT == MVT::i1);
- if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1)
- : State.AllocateReg(I1RegList2)) {
+ if (unsigned Reg = IsWave64 ? State.AllocateReg(SGPRArgsWave64)
+ : State.AllocateReg(SGPRArgsWave32)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6eda46120a8172..e9687a0621f7c0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2833,7 +2833,7 @@ SDValue SITargetLowering::LowerFormalArguments(
RC = &AMDGPU::SGPR_32RegClass;
else {
if (VT == MVT::i1 && Subtarget->isWave64())
- RC = &AMDGPU::SGPR_64RegClass;
+ RC = Subtarget->getBoolRC();
else
llvm_unreachable("Unexpected register class in LowerFormalArguments!");
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
index 609883c190223a..b8e46802516179 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
@@ -200,9 +200,9 @@ define amdgpu_kernel void @test_call_external_i1_func_void() #0 {
; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32)
; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32)
- ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0
- ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+ ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GCN-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GCN-NEXT: S_ENDPGM 0
@@ -278,10 +278,9 @@ define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 {
; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32)
; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32)
- ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0
- ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY21]], 1
- ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_ZEXT]](s32)
+ ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GCN-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
; GCN-NEXT: G_STORE [[ZEXT]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
@@ -340,10 +339,9 @@ define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 {
; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32)
; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32)
- ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0
- ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GCN-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY21]], 1
- ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_SEXT]](s32)
+ ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GCN-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
; GCN-NEXT: G_STORE [[SEXT]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index 2ede504223cb82..133e490b0026b2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -354,8 +354,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s1)
- ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s1)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ANYEXT]](s64)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -367,7 +367,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: S_ENDPGM 0
call void @external_void_func_i1(i1 true)
@@ -413,8 +413,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s1)
- ; CHECK-NEXT: $vgpr0 = COPY [[SEXT]](s32)
+ ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s1)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[SEXT]](s64)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -426,7 +426,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_signext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_signext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: S_ENDPGM 0
%var = load volatile i1, ptr addrspace(1) undef
@@ -473,8 +473,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s1)
- ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT]](s32)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s1)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ZEXT]](s64)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -486,7 +486,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_zeroext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_zeroext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: S_ENDPGM 0
%var = load volatile i1, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index 062d2e173e6d8a..f6d5858b46f6e0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -3,6 +3,7 @@
; the frame info, so some functions have manually added stack object
; checks.
; RUN: llc -march=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s
; FIXME: pre-VI should have same ABI without legal i16 operations.
define void @void_func_empty_arg({} %arg0, i32 %arg1) #0 {
@@ -34,10 +35,10 @@ define void @void_func_empty_array([0 x i8] %arg0, i32 %arg1) #0 {
define void @void_func_i1(i1 %arg0) #0 {
; CHECK-LABEL: name: void_func_i1
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: SI_RETURN
@@ -48,11 +49,10 @@ define void @void_func_i1(i1 %arg0) #0 {
define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
; CHECK-LABEL: name: void_func_i1_zeroext
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 1
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_ZEXT]](s32)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
@@ -68,11 +68,10 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
define void @void_func_i1_signext(i1 signext %arg0) #0 {
; CHECK-LABEL: name: void_func_i1_signext
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY]], 1
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_SEXT]](s32)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
@@ -89,10 +88,10 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; CHECK-LABEL: name: i1_arg_i1_use
; CHECK: bb.1.bb:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
- ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -1986,25 +1985,25 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
; CHECK-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28
; CHECK-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
; CHECK-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
- ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.4, align 16, addrspace 5)
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, addrspace 5)
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32)
- ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s1) from %fixed-stack.3, align 4, addrspace 5)
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD1]](s32)
- ; CHECK-NEXT: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
- ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s16) from %fixed-stack.2, align 8, addrspace 5)
- ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD2]](s16)
- ; CHECK-NEXT: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
- ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s16) from %fixed-stack.1, align 4, addrspace 5)
- ; CHECK-NEXT: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
- ; CHECK-NEXT: [[LOAD4:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s16) from %fixed-stack.0, align 16, addrspace 5)
+ ; CHECK-NEXT: [[COPY31:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s64)
+ ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s16) from %fixed-stack.2, align 4, addrspace 5)
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s16)
+ ; CHECK-NEXT: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s16) from %fixed-stack.1, align 8, addrspace 5)
+ ; CHECK-NEXT: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
+ ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s16) from %fixed-stack.0, align 4, addrspace 5)
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+
; CHECK-NEXT: G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1)
+ ; CHECK-NEXT: G_STORE [[LOAD2]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: G_STORE [[LOAD3]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1)
- ; CHECK-NEXT: G_STORE [[LOAD4]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: SI_RETURN
store volatile <32 x i32> %arg0, ptr addrspace(1) undef
store volatile i1 %arg1, ptr addrspace(1) undef
@@ -2777,4 +2776,197 @@ define void @vector_ptr_in_struct_arg({ <2 x ptr addrspace(1)>, <2 x ptr addrspa
ret void
}
+; Check the calling convention for i1 arguments
+define void @many_i1_args(
+ i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
+ i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
+ i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
+ i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
+; CHECK-LABEL: name: many_i1_args
+; CHECK: bb.1 (%ir-block.0):
+; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
+; CHECK-NEXT: {{ $}}
+; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $sgpr2_sgpr3
+; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s64)
+; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
+; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
+; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $sgpr8_sgpr9
+; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s64)
+; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s64) = COPY $sgpr10_sgpr11
+; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s64)
+; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s64) = COPY $sgpr12_sgpr13
+; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s64)
+; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s64) = COPY $sgpr14_sgpr15
+; CHECK-NEXT: [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s64)
+; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
+; CHECK-NEXT: [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s64)
+; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY $sgpr18_sgpr19
+; CHECK-NEXT: [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s64)
+; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY $sgpr20_sgpr21
+; CHECK-NEXT: [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s64)
+; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY $sgpr22_sgpr23
+; CHECK-NEXT: [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s64)
+; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY $sgpr24_sgpr25
+; CHECK-NEXT: [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s64)
+; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY $sgpr26_sgpr27
+; CHECK-NEXT: [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s64)
+; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY $sgpr28_sgpr29
+; CHECK-NEXT: [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s64)
+; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr0
+; CHECK-NEXT: [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
+; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr1
+; CHECK-NEXT: [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
+; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr2
+; CHECK-NEXT: [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
+; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr3
+; CHECK-NEXT: [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
+; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr4
+; CHECK-NEXT: [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr5
+; CHECK-NEXT: [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
+; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr6
+; CHECK-NEXT: [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+; CHECK-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr7
+; CHECK-NEXT: [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
+; CHECK-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr8
+; CHECK-NEXT: [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
+; CHECK-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr9
+; CHECK-NEXT: [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
+; CHECK-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr10
+; CHECK-NEXT: [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
+; CHECK-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr11
+; CHECK-NEXT: [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
+; CHECK-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr12
+; CHECK-NEXT: [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
+; CHECK-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr13
+; CHECK-NEXT: [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
+; CHECK-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr14
+; CHECK-NEXT: [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; CHECK-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr15
+; CHECK-NEXT: [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; CHECK-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr16
+; CHECK-NEXT: [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+;
+; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; CHECK-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs of TRUNC1-TRUNC30 omitted
+; CHECK: G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+;
+; GFX11-LABEL: name: many_i1_args
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1
+; GFX11-NEXT: {{ $}}
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr3
+; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s32)
+; GFX11-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s32)
+; GFX11-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr5
+; GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s32)
+; GFX11-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX11-NEXT: [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s32)
+; GFX11-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr7
+; GFX11-NEXT: [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
+; GFX11-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr8
+; GFX11-NEXT: [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
+; GFX11-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr9
+; GFX11-NEXT: [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
+; GFX11-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr10
+; GFX11-NEXT: [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
+; GFX11-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr11
+; GFX11-NEXT: [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
+; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr12
+; GFX11-NEXT: [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
+; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $sgpr13
+; GFX11-NEXT: [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
+; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $sgpr14
+; GFX11-NEXT: [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
+; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $sgpr15
+; GFX11-NEXT: [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
+; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $sgpr16
+; GFX11-NEXT: [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
+; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $sgpr17
+; GFX11-NEXT: [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
+; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $sgpr18
+; GFX11-NEXT: [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
+; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $sgpr19
+; GFX11-NEXT: [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $sgpr20
+; GFX11-NEXT: [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
+; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $sgpr21
+; GFX11-NEXT: [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $sgpr22
+; GFX11-NEXT: [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
+; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $sgpr23
+; GFX11-NEXT: [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
+; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $sgpr24
+; GFX11-NEXT: [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
+; GFX11-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $sgpr25
+; GFX11-NEXT: [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
+; GFX11-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $sgpr26
+; GFX11-NEXT: [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
+; GFX11-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $sgpr27
+; GFX11-NEXT: [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
+; GFX11-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $sgpr28
+; GFX11-NEXT: [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
+; GFX11-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $sgpr29
+; GFX11-NEXT: [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; GFX11-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT: [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; GFX11-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX11-NEXT: [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+;
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs of TRUNC1-TRUNC30 omitted
+; GFX11: G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ store volatile i1 %arg2, ptr addrspace(1) undef
+ store volatile i1 %arg3, ptr addrspace(1) undef
+ store volatile i1 %arg4, ptr addrspace(1) undef
+ store volatile i1 %arg5, ptr addrspace(1) undef
+ store volatile i1 %arg6, ptr addrspace(1) undef
+ store volatile i1 %arg7, ptr addrspace(1) undef
+
+ store volatile i1 %arg8, ptr addrspace(1) undef
+ store volatile i1 %arg9, ptr addrspace(1) undef
+ store volatile i1 %arg10, ptr addrspace(1) undef
+ store volatile i1 %arg11, ptr addrspace(1) undef
+ store volatile i1 %arg12, ptr addrspace(1) undef
+ store volatile i1 %arg13, ptr addrspace(1) undef
+ store volatile i1 %arg14, ptr addrspace(1) undef
+ store volatile i1 %arg15, ptr addrspace(1) undef
+
+ store volatile i1 %arg16, ptr addrspace(1) undef
+ store volatile i1 %arg17, ptr addrspace(1) undef
+ store volatile i1 %arg18, ptr addrspace(1) undef
+ store volatile i1 %arg19, ptr addrspace(1) undef
+ store volatile i1 %arg20, ptr addrspace(1) undef
+ store volatile i1 %arg21, ptr addrspace(1) undef
+ store volatile i1 %arg22, ptr addrspace(1) undef
+ store volatile i1 %arg23, ptr addrspace(1) undef
+
+ store volatile i1 %arg24, ptr addrspace(1) undef
+ store volatile i1 %arg25, ptr addrspace(1) undef
+ store volatile i1 %arg26, ptr addrspace(1) undef
+ store volatile i1 %arg27, ptr addrspace(1) undef
+ store volatile i1 %arg28, ptr addrspace(1) undef
+ store volatile i1 %arg29, ptr addrspace(1) undef
+ store volatile i1 %arg30, ptr addrspace(1) undef
+ store volatile i1 %arg31, ptr addrspace(1) undef
+
+ ret void
+}
+
attributes #0 = { nounwind }
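For readers skimming the generated checks above, here is a minimal sketch of what the new ABI means at the GlobalISel level (the function name is hypothetical; the register choices follow the many_i1_args checks in this file):

; Sketch only: an incoming i1 is now a wavesize lane mask in SGPRs, so
; the irtranslator copies an SGPR pair (wave64) or a single SGPR
; (wave32/GFX11) and truncates it to s1.
define void @i1_in_sgpr(i1 %c) {
  ; wave64: %0:_(s64) = COPY $sgpr0_sgpr1  ->  %1:_(s1) = G_TRUNC %0(s64)
  ; wave32: %0:_(s32) = COPY $sgpr0        ->  %1:_(s1) = G_TRUNC %0(s32)
  store volatile i1 %c, ptr addrspace(1) undef
  ret void
}

Only the first 15 (wave64) or 30 (wave32) i1 arguments get SGPRs; any further i1 arguments fall back to VGPRs, as the liveins lists of many_i1_args show.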
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
index 7c7a600c4bda2b..f9e717d1424e28 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
@@ -22,10 +22,10 @@ define i32 @load_const_i32_gv() {
define i32 @load_select_const_i32_gv(i1 %cond) {
; CHECK-LABEL: name: load_select_const_i32_gv
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv0
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv1
; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(p1) = G_SELECT [[TRUNC]](s1), [[GV]], [[GV1]]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index a5482bd5b79a96..c3b8a6b2b75263 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -10,8 +10,8 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
; GFX7-LABEL: v_div_fmas_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX7-NEXT: s_and_b32 s4, 1, s0
+; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX7-NEXT: s_nop 3
; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -19,8 +19,8 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
; GFX8-LABEL: v_div_fmas_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX8-NEXT: s_and_b32 s4, 1, s0
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX8-NEXT: s_nop 3
; GFX8-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -28,32 +28,32 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
; GFX10_W32-LABEL: v_div_fmas_f32:
; GFX10_W32: ; %bb.0:
; GFX10_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W32-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX10_W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX10_W32-NEXT: s_and_b32 s4, 1, s0
+; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
; GFX10_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX10_W32-NEXT: s_setpc_b64 s[30:31]
;
; GFX10_W64-LABEL: v_div_fmas_f32:
; GFX10_W64: ; %bb.0:
; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX10_W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX10_W64-NEXT: s_and_b32 s4, 1, s0
+; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX10_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX10_W64-NEXT: s_setpc_b64 s[30:31]
;
; GFX11_W32-LABEL: v_div_fmas_f32:
; GFX11_W32: ; %bb.0:
; GFX11_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W32-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11_W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX11_W32-NEXT: s_and_b32 s0, 1, s0
+; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX11_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX11_W32-NEXT: s_setpc_b64 s[30:31]
;
; GFX11_W64-LABEL: v_div_fmas_f32:
; GFX11_W64: ; %bb.0:
; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W64-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11_W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX11_W64-NEXT: s_and_b32 s0, 1, s0
+; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
; GFX11_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX11_W64-NEXT: s_setpc_b64 s[30:31]
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d)
@@ -64,8 +64,8 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
; GFX7-LABEL: v_div_fmas_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX7-NEXT: s_and_b32 s4, 1, s0
+; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX7-NEXT: s_nop 3
; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -73,8 +73,8 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
; GFX8-LABEL: v_div_fmas_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX8-NEXT: s_and_b32 s4, 1, s0
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX8-NEXT: s_nop 3
; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -82,32 +82,32 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
; GFX10_W32-LABEL: v_div_fmas_f64:
; GFX10_W32: ; %bb.0:
; GFX10_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W32-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX10_W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX10_W32-NEXT: s_and_b32 s4, 1, s0
+; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX10_W32-NEXT: s_setpc_b64 s[30:31]
;
; GFX10_W64-LABEL: v_div_fmas_f64:
; GFX10_W64: ; %bb.0:
; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX10_W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX10_W64-NEXT: s_and_b32 s4, 1, s0
+; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX10_W64-NEXT: s_setpc_b64 s[30:31]
;
; GFX11_W32-LABEL: v_div_fmas_f64:
; GFX11_W32: ; %bb.0:
; GFX11_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W32-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX11_W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11_W32-NEXT: s_and_b32 s0, 1, s0
+; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX11_W32-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX11_W32-NEXT: s_setpc_b64 s[30:31]
;
; GFX11_W64-LABEL: v_div_fmas_f64:
; GFX11_W64: ; %bb.0:
; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W64-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX11_W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX11_W64-NEXT: s_and_b32 s0, 1, s0
+; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
; GFX11_W64-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX11_W64-NEXT: s_setpc_b64 s[30:31]
%result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d)
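Every assembly update in this file follows the same shape: %d now arrives in s[0:1] (wave64), the truncation reads its low half s0, and the old VALU and/compare pair on a VGPR becomes a scalar AND feeding the 64-bit-encoded compare. A reduced sketch (hypothetical function name; the expected lines are paraphrased from the GFX7 checks above):

; Sketch: the i1 operand is now read from SGPRs instead of v3.
define float @div_fmas_sketch(float %a, float %b, float %c, i1 %d) {
  ; old: v_and_b32_e32 v3, 1, v3   /  v_cmp_ne_u32_e32 vcc, 0, v3
  ; new: s_and_b32     s4, 1, s0   /  v_cmp_ne_u32_e64 vcc, 0, s4
  %r = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d)
  ret float %r
}
declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)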
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index da9601a8998c2b..39b4a0992c9b22 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -168,9 +168,9 @@ define void @localize_internal_globals(i1 %cond) {
; GFX9-LABEL: localize_internal_globals:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT: s_and_b32 s4, 1, s0
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4
+; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; GFX9-NEXT: s_cbranch_execnz .LBB2_3
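The localizer diff shows the same change when branching on the argument: the lane test and the mask inversion both move onto the SALU. A reduced sketch of the control flow under test (hypothetical function name; the expected lines are paraphrased from the GFX9 checks above and assume wave64):

; Sketch: branching on the inverse of an incoming i1.
define void @branch_on_i1(i1 %cond) {
entry:
  ; s_and_b32        s4, 1, s0          ; test bit 0 of the mask
  ; v_cmp_ne_u32_e64 s[4:5], 0, s4      ; rebuild a full lane mask
  ; s_xor_b64        s[4:5], s[4:5], -1 ; invert for the false edge
  br i1 %cond, label %done, label %work
work:
  store volatile i32 0, ptr addrspace(1) undef
  br label %done
done:
  ret void
}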
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index 5c40a4ce13e31a..9beec51710598e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -10,11 +10,10 @@ define i32 @select_sdiv_lhs_const_i32(i1 %cond) {
; GCN-LABEL: select_sdiv_lhs_const_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x1e848
-; GCN-NEXT: v_mov_b32_e32 v2, 0x30d40
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: s_mov_b32 s6, 0x30d40
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, s6, 0x1e848
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i32 5, i32 8
%op = sdiv i32 1000000, %select
@@ -29,11 +28,10 @@ define i32 @select_sdiv_rhs_const_i32(i1 %cond) {
; GCN-LABEL: select_sdiv_rhs_const_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x2710
-; GCN-NEXT: v_mov_b32_e32 v2, 0x3e8
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: s_movk_i32 s6, 0x3e8
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, s6, 0x2710
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i32 42000, i32 420000
%op = sdiv i32 %select, 42
@@ -48,11 +46,10 @@ define <2 x i32> @select_sdiv_lhs_const_v2i32(i1 %cond) {
; GCN-LABEL: select_sdiv_lhs_const_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x22b
-; GCN-NEXT: v_mov_b32_e32 v2, 0x29a
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: s_movk_i32 s6, 0x29a
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, s6, 0x22b
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, 0x594
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, <2 x i32> <i32 5, i32 undef>, <2 x i32> <i32 6, i32 7>
@@ -68,14 +65,13 @@ define <2 x i32> @select_sdiv_rhs_const_v2i32(i1 %cond) {
; GCN-LABEL: select_sdiv_rhs_const_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x3661c
-; GCN-NEXT: v_mov_b32_e32 v2, 0x307dd
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT: v_mov_b32_e32 v1, 0x23b02a
-; GCN-NEXT: v_mov_b32_e32 v2, 0x13e3a0c
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_mov_b32 s6, 0x307dd
+; GCN-NEXT: s_mov_b32 s5, 0x13e3a0c
+; GCN-NEXT: s_cselect_b32 s4, s6, 0x3661c
+; GCN-NEXT: s_cselect_b32 s5, s5, 0x23b02a
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, <2 x i32> <i32 8342123, i32 834212353>, <2 x i32> <i32 9355456, i32 93554321>
%op = sdiv <2 x i32> %select, <i32 42, i32 40>
@@ -126,40 +122,41 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
; GCN-LABEL: select_sdiv_lhs_opaque_const0_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: s_getpc_b64 s[6:7]
+; GCN-NEXT: s_add_u32 s6, s6, gv@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, gv@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, s4
-; GCN-NEXT: v_cndmask_b32_e32 v0, 5, v1, vcc
-; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT: s_cselect_b32 s4, s6, 5
+; GCN-NEXT: s_ashr_i32 s5, s4, 31
+; GCN-NEXT: s_add_i32 s4, s4, s5
+; GCN-NEXT: s_xor_b32 s4, s4, s5
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GCN-NEXT: s_sub_i32 s6, 0, s4
+; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT: v_mul_lo_u32 v1, s6, v0
+; GCN-NEXT: s_mov_b32 s6, 0xf4240
+; GCN-NEXT: v_mul_hi_u32 v1, v0, v1
; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, v0
-; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v0
-; GCN-NEXT: s_mov_b32 s4, 0xf4240
-; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT: v_mul_lo_u32 v3, v3, v2
-; GCN-NEXT: v_mul_hi_u32 v3, v2, v3
-; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_mul_hi_u32 v2, v2, s4
-; GCN-NEXT: v_mul_lo_u32 v3, v2, v0
-; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0xf4240, v3
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v0
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s6
+; GCN-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NEXT: s_mul_i32 s6, s6, s4
+; GCN-NEXT: s_sub_i32 s6, 0xf4240, s6
+; GCN-NEXT: s_sub_i32 s7, s6, s4
+; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT: s_cmp_ge_u32 s6, s4
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT: s_cselect_b32 s6, s7, s6
+; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT: s_cmp_ge_u32 s6, s4
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, s5, v0
+; GCN-NEXT: v_subrev_u32_e32 v0, vcc, s5, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i32 ptrtoint (ptr addrspace(1) @gv to i32), i32 5
%op = sdiv i32 1000000, %select
@@ -208,40 +205,41 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
; GCN-LABEL: select_sdiv_lhs_opaque_const1_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: s_getpc_b64 s[6:7]
+; GCN-NEXT: s_add_u32 s6, s6, gv@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, gv@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, s4
-; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 5, vcc
-; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT: s_cselect_b32 s4, 5, s6
+; GCN-NEXT: s_ashr_i32 s5, s4, 31
+; GCN-NEXT: s_add_i32 s4, s4, s5
+; GCN-NEXT: s_xor_b32 s4, s4, s5
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GCN-NEXT: s_sub_i32 s6, 0, s4
+; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT: v_mul_lo_u32 v1, s6, v0
+; GCN-NEXT: s_mov_b32 s6, 0xf4240
+; GCN-NEXT: v_mul_hi_u32 v1, v0, v1
; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, v0
-; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v0
-; GCN-NEXT: s_mov_b32 s4, 0xf4240
-; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT: v_mul_lo_u32 v3, v3, v2
-; GCN-NEXT: v_mul_hi_u32 v3, v2, v3
-; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_mul_hi_u32 v2, v2, s4
-; GCN-NEXT: v_mul_lo_u32 v3, v2, v0
-; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0xf4240, v3
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v0
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s6
+; GCN-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NEXT: s_mul_i32 s6, s6, s4
+; GCN-NEXT: s_sub_i32 s6, 0xf4240, s6
+; GCN-NEXT: s_sub_i32 s7, s6, s4
+; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT: s_cmp_ge_u32 s6, s4
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT: s_cselect_b32 s6, s7, s6
+; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT: s_cmp_ge_u32 s6, s4
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, s5, v0
+; GCN-NEXT: v_subrev_u32_e32 v0, vcc, s5, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i32 5, i32 ptrtoint (ptr addrspace(1) @gv to i32)
%op = sdiv i32 1000000, %select
@@ -257,18 +255,15 @@ define i32 @select_sdiv_rhs_opaque_const0_i32(i1 %cond) {
; GCN-LABEL: select_sdiv_rhs_opaque_const0_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x392fa
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: s_getpc_b64 s[6:7]
+; GCN-NEXT: s_add_u32 s6, s6, gv@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, gv@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0x30c30c31
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT: s_mov_b32 s4, 0x30c30c31
-; GCN-NEXT: v_mul_hi_i32 v0, v0, s4
+; GCN-NEXT: s_cselect_b32 s4, s6, 0x392fa
+; GCN-NEXT: v_mul_hi_i32 v0, s4, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v0, 3, v0
; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1
@@ -287,18 +282,15 @@ define i32 @select_sdiv_rhs_opaque_const1_i32(i1 %cond) {
; GCN-LABEL: select_sdiv_rhs_opaque_const1_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0xa410
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: s_getpc_b64 s[6:7]
+; GCN-NEXT: s_add_u32 s6, s6, gv@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, gv@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0x30c30c31
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT: s_mov_b32 s4, 0x30c30c31
-; GCN-NEXT: v_mul_hi_i32 v0, v0, s4
+; GCN-NEXT: s_cselect_b32 s4, 0xa410, s6
+; GCN-NEXT: v_mul_hi_i32 v0, s4, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v0, 3, v0
; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1
@@ -316,11 +308,10 @@ define i32 @select_add_lhs_const_i32(i1 %cond) {
; GCN-LABEL: select_add_lhs_const_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0xf4248
-; GCN-NEXT: v_mov_b32_e32 v2, 0xf4245
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: s_mov_b32 s6, 0xf4245
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, s6, 0xf4248
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i32 5, i32 8
%op = add i32 1000000, %select
@@ -335,11 +326,9 @@ define float @select_fadd_lhs_const_i32_fmf(i1 %cond) {
; GCN-LABEL: select_fadd_lhs_const_i32_fmf:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x40a00000
-; GCN-NEXT: v_mov_b32_e32 v2, 0x40400000
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: v_mov_b32_e32 v0, 0x40a00000
+; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, float 2.0, float 4.0
%op = fadd nnan nsz float 1.0, %select
@@ -351,12 +340,10 @@ define i32 @select_mul_lhs_const_i32(i1 %cond) {
; GCN-LABEL: select_mul_lhs_const_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x1f40
-; GCN-NEXT: v_mov_b32_e32 v2, 0x1388
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_movk_i32 s6, 0x1388
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, s6, 0x1f40
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; IR-LABEL: @select_mul_lhs_const_i32(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
; IR-NEXT: ret i32 [[OP]]
@@ -370,12 +357,10 @@ define i32 @select_mul_rhs_const_i32(i1 %cond) {
; GCN-LABEL: select_mul_rhs_const_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x1f40
-; GCN-NEXT: v_mov_b32_e32 v2, 0x1388
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_movk_i32 s6, 0x1388
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, s6, 0x1f40
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; IR-LABEL: @select_mul_rhs_const_i32(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
; IR-NEXT: ret i32 [[OP]]
@@ -411,9 +396,7 @@ define i16 @select_add_trunc_select(i1 %cond) {
; GCN-LABEL: select_add_trunc_select:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 50, 47, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, 50, 47, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
; IR-LABEL: @select_add_trunc_select(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50
@@ -432,9 +415,9 @@ define i32 @select_add_sext_select(i1 %cond) {
; GCN-LABEL: select_add_sext_select:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 50, 29, vcc
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, 29, 50
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i16 -13, i16 8
%trunc = sext i16 %select to i32
@@ -450,9 +433,9 @@ define i32 @select_add_zext_select(i1 %cond) {
; GCN-LABEL: select_add_zext_select:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 50, 47, vcc
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, 47, 50
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i16 5, i16 8
%trunc = zext i16 %select to i32
@@ -468,11 +451,10 @@ define i32 @select_add_bitcast_select(i1 %cond) {
; GCN-LABEL: select_add_bitcast_select:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x4000002a
-; GCN-NEXT: v_mov_b32_e32 v2, 0x3f80002a
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: s_mov_b32 s6, 0x3f80002a
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, s6, 0x4000002a
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, float 1.0, float 2.0
%trunc = bitcast float %select to i32
@@ -493,10 +475,8 @@ define <2 x half> @multi_use_cast_regression(i1 %cond) {
; GCN-LABEL: multi_use_cast_regression:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x3c00
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3c00
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5]
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
; GCN-NEXT: v_sub_f32_e32 v1, 1.0, v0
; GCN-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
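The common thread in this file: with %cond already in an SGPR pair, a uniform select no longer materializes both constants into VGPRs for v_cndmask; it becomes s_and_b64 against exec plus s_cselect_b32, and only the result moves to a VGPR. Reduced sketch (hypothetical function name; the expected lines are paraphrased from the wave64 GCN checks above):

; Sketch: a select on an incoming i1 now lowers to SALU code.
define i32 @uniform_select_sketch(i1 %cond) {
  ; s_and_b64     s[4:5], s[4:5], exec  ; SCC = ((mask & exec) != 0)
  ; s_cselect_b32 s4, 5, 8              ; pick on SCC
  ; v_mov_b32_e32 v0, s4                ; move the result to a VGPR
  %v = select i1 %cond, i32 5, i32 8
  ret i32 %v
}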
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 01dcc26566663b..9c8259d32dae55 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -8,7 +8,7 @@ define void @void_func_i1(i1 %arg0) #0 {
; CIGFX89-LABEL: void_func_i1:
; CIGFX89: ; %bb.0:
; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT: v_and_b32_e32 v0, 1, v0
+; CIGFX89-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
; CIGFX89-NEXT: s_mov_b32 s6, -1
; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -18,7 +18,7 @@ define void @void_func_i1(i1 %arg0) #0 {
; GFX11-LABEL: void_func_i1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
@@ -31,6 +31,7 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
; CIGFX89-LABEL: void_func_i1_zeroext:
; CIGFX89: ; %bb.0:
; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; CIGFX89-NEXT: v_or_b32_e32 v0, 12, v0
; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
; CIGFX89-NEXT: s_mov_b32 s6, -1
@@ -41,9 +42,11 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
; GFX11-LABEL: void_func_i1_zeroext:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_or_b32_e32 v0, 12, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, 12, v0
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ext = zext i1 %arg0 to i32
@@ -56,7 +59,8 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
; CI-LABEL: void_func_i1_signext:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_add_i32_e32 v0, vcc, 12, v0
+; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CI-NEXT: v_sub_i32_e32 v0, vcc, 12, v0
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -66,7 +70,8 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
; VI-LABEL: void_func_i1_signext:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 12, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; VI-NEXT: v_sub_u32_e32 v0, vcc, 12, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -76,7 +81,8 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
; GFX9-LABEL: void_func_i1_signext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v0, 12, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: v_sub_u32_e32 v0, 12, v0
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -86,9 +92,11 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
; GFX11-LABEL: void_func_i1_signext:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 12, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_nc_u32_e32 v0, 12, v0
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ext = sext i1 %arg0 to i32
@@ -101,9 +109,7 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; CIGFX89-LABEL: i1_arg_i1_use:
; CIGFX89: ; %bb.0: ; %bb
; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT: v_and_b32_e32 v0, 1, v0
-; CIGFX89-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CIGFX89-NEXT: s_xor_b64 s[6:7], vcc, -1
+; CIGFX89-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; CIGFX89-NEXT: s_and_saveexec_b64 s[4:5], s[6:7]
; CIGFX89-NEXT: s_cbranch_execz .LBB3_2
; CIGFX89-NEXT: ; %bb.1: ; %bb1
@@ -120,11 +126,9 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; GFX11-LABEL: i1_arg_i1_use:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: s_xor_b32 s1, vcc_lo, -1
+; GFX11-NEXT: s_xor_b32 s1, s0, -1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s1
; GFX11-NEXT: s_cbranch_execz .LBB3_2
; GFX11-NEXT: ; %bb.1: ; %bb1
@@ -2775,12 +2779,11 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
+; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12
; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:4
+; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2789,15 +2792,15 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v19, v20
-; CI-NEXT: v_and_b32_e32 v0, 1, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v18, v20
+; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; CI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v17, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v16, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v18, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v17, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v19, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v18, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2814,12 +2817,12 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16
+; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:4
+; VI-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2828,14 +2831,13 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, 1, v20
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v18, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v20, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_short v17, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v16, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_short v18, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v17, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2852,14 +2854,12 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:4
+; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2868,29 +2868,28 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v20
-; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v18, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v20, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_short v17, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v16, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_short v18, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v17, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_v32i32_i1_i8_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x4
+; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: scratch_load_u8 v32, off, s32 offset:4
-; GFX11-NEXT: scratch_load_u16 v33, off, s32 offset:8
-; GFX11-NEXT: scratch_load_u16 v34, off, s32 offset:12
-; GFX11-NEXT: scratch_load_u16 v35, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v33, off, s32 offset:4
+; GFX11-NEXT: scratch_load_u16 v34, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v35, off, s32 offset:12
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: v_cndmask_b32_e64 v32, 0, 1, s0
+; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
@@ -2899,8 +2898,6 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: v_and_b32_e32 v16, 1, v32
; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
@@ -2909,7 +2906,7 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v16, off, s[0:3], 0 dlc
+; GFX11-NEXT: buffer_store_b8 v32, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc
@@ -4566,4 +4563,276 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
ret void
}
+define void @many_i1_args(
+ i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
+ i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
+ i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
+ i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
+; GFX9-LABEL: many_i1_args:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_xor_saveexec_b64 vcc, -1
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, vcc
+; GFX9-NEXT: v_writelane_b32 v19, s30, 0
+; GFX9-NEXT: v_writelane_b32 v19, s31, 1
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GFX9-NEXT: s_mov_b32 s31, 0xf000
+; GFX9-NEXT: s_mov_b32 s30, -1
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[10:11]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[12:13]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[14:15]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[16:17]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[18:19]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[20:21]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[22:23]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[24:25]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[26:27]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[28:29]
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v1
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v3
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v4
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v5
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v6
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v7
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v8
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v9
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v10
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v11
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v12
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v13
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v14
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v15
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v16
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v17
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v18
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readlane_b32 s31, v19, 1
+; GFX9-NEXT: v_readlane_b32 s30, v19, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: many_i1_args:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, vcc_lo
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1
+; GFX11-NEXT: s_mov_b32 s30, -1
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_mov_b32 s31, 0x31016000
+; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s6
+; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s7
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s9
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s11
+; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s12
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s13
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s14
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s15
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s16
+; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s17
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s18
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s19
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s20
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s21
+; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s22
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s24
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s25
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s26
+; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s27
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s28
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s29
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v0, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v1, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ store volatile i1 %arg2, ptr addrspace(1) undef
+ store volatile i1 %arg3, ptr addrspace(1) undef
+ store volatile i1 %arg4, ptr addrspace(1) undef
+ store volatile i1 %arg5, ptr addrspace(1) undef
+ store volatile i1 %arg6, ptr addrspace(1) undef
+ store volatile i1 %arg7, ptr addrspace(1) undef
+
+ store volatile i1 %arg8, ptr addrspace(1) undef
+ store volatile i1 %arg9, ptr addrspace(1) undef
+ store volatile i1 %arg10, ptr addrspace(1) undef
+ store volatile i1 %arg11, ptr addrspace(1) undef
+ store volatile i1 %arg12, ptr addrspace(1) undef
+ store volatile i1 %arg13, ptr addrspace(1) undef
+ store volatile i1 %arg14, ptr addrspace(1) undef
+ store volatile i1 %arg15, ptr addrspace(1) undef
+
+ store volatile i1 %arg16, ptr addrspace(1) undef
+ store volatile i1 %arg17, ptr addrspace(1) undef
+ store volatile i1 %arg18, ptr addrspace(1) undef
+ store volatile i1 %arg19, ptr addrspace(1) undef
+ store volatile i1 %arg20, ptr addrspace(1) undef
+ store volatile i1 %arg21, ptr addrspace(1) undef
+ store volatile i1 %arg22, ptr addrspace(1) undef
+ store volatile i1 %arg23, ptr addrspace(1) undef
+
+ store volatile i1 %arg24, ptr addrspace(1) undef
+ store volatile i1 %arg25, ptr addrspace(1) undef
+ store volatile i1 %arg26, ptr addrspace(1) undef
+ store volatile i1 %arg27, ptr addrspace(1) undef
+ store volatile i1 %arg28, ptr addrspace(1) undef
+ store volatile i1 %arg29, ptr addrspace(1) undef
+ store volatile i1 %arg30, ptr addrspace(1) undef
+ store volatile i1 %arg31, ptr addrspace(1) undef
+
+ ret void
+}
+
+
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index e7d86c0c178e97..f725f26ccca146 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -12,6 +12,8 @@ define i1 @i1_func_void() #0 {
; GFX789-NEXT: s_mov_b32 s6, -1
; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
; GFX789-NEXT: s_waitcnt vmcnt(0)
+; GFX789-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX789-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
; GFX789-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i1_func_void:
@@ -21,6 +23,9 @@ define i1 @i1_func_void() #0 {
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load i1, ptr addrspace(1) undef
ret i1 %val
diff --git a/llvm/test/CodeGen/AMDGPU/z_callee.ll b/llvm/test/CodeGen/AMDGPU/z_callee.ll
deleted file mode 100644
index 44af2c90f900b3..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_callee.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-define void @void_func_i1(i1 %arg0) #0 {
-; For CIGFX89, the i1 arg is passed in s4, but the v_cndmask insn uses s[4:5].
-; Therefore, the "s_mov_b32 s5, 0" is generated.
-;
-; CIGFX89-LABEL: void_func_i1:
-; CIGFX89: ; %bb.0:
-; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
-; CIGFX89-NEXT: s_mov_b32 s6, -1
-; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0
-; CIGFX89-NEXT: s_waitcnt vmcnt(0)
-; CIGFX89-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: void_func_i1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- store i1 %arg0, ptr addrspace(1) undef
- ret void
-}
-
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller.ll b/llvm/test/CodeGen/AMDGPU/z_caller.ll
deleted file mode 100644
index f9203cf078e47c..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_caller.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-
-declare hidden void @external_void_func_i1(i1) #0
-
-define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
-; GFX9-LABEL: test_call_external_void_func_i1_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b64 s[4:5], -1
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[8:9]
-; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_i1@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: test_call_external_void_func_i1_imm:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s0, -1
-; GFX11-NEXT: s_mov_b32 s32, 0
-; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1@rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1@rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_endpgm
- call void @external_void_func_i1(i1 true)
- ret void
-}
-
-attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller2.ll b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
deleted file mode 100644
index 1141476960250a..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_caller2.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-
-declare hidden void @external_void_func_i1_signext(i1 signext) #0
-
-define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
-; GFX9-LABEL: test_call_external_void_func_i1_signext:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s5
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[8:9]
-; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_i1_signext@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_i1_signext@rel32@hi+12
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: test_call_external_void_func_i1_signext:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s32, 0
-; GFX11-NEXT: s_getpc_b64 s[4:5]
-; GFX11-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 1, v0
-; GFX11-NEXT: s_mov_b32 s0, s2
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX11-NEXT: s_endpgm
- %var = load volatile i1, ptr addrspace(1) undef
- call void @external_void_func_i1_signext(i1 signext %var)
- ret void
-}
-
-
-
-attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/z_return.ll b/llvm/test/CodeGen/AMDGPU/z_return.ll
deleted file mode 100644
index 6bf64da7a1b8ff..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_return.ll
+++ /dev/null
@@ -1,80 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-define i1 @i1_func_void() #0 {
- %val = load i1, ptr addrspace(1) undef
- ret i1 %val
-}
-
-define void @test_call_i1_func_void() #0 {
-; CIGFX89-LABEL: test_call_i1_func_void:
-; CIGFX89: ; %bb.0:
-; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT: s_mov_b32 s6, s33
-; CIGFX89-NEXT: s_mov_b32 s33, s32
-; CIGFX89-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; CIGFX89-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
-; CIGFX89-NEXT: s_mov_b64 exec, s[4:5]
-; CIGFX89-NEXT: s_addk_i32 s32, 0x400
-; CIGFX89-NEXT: s_getpc_b64 s[4:5]
-; CIGFX89-NEXT: s_add_u32 s4, s4, i1_func_void@gotpcrel32@lo+4
-; CIGFX89-NEXT: s_addc_u32 s5, s5, i1_func_void@gotpcrel32@hi+12
-; CIGFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; CIGFX89-NEXT: v_writelane_b32 v1, s30, 0
-; CIGFX89-NEXT: v_writelane_b32 v1, s31, 1
-; CIGFX89-NEXT: s_waitcnt lgkmcnt(0)
-; CIGFX89-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; CIGFX89-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CIGFX89-NEXT: global_store_byte v[2:3], v0, off
-; CIGFX89-NEXT: s_waitcnt vmcnt(0)
-; CIGFX89-NEXT: v_readlane_b32 s31, v1, 1
-; CIGFX89-NEXT: v_readlane_b32 s30, v1, 0
-; CIGFX89-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; CIGFX89-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
-; CIGFX89-NEXT: s_mov_b64 exec, s[4:5]
-; CIGFX89-NEXT: s_addk_i32 s32, 0xfc00
-; CIGFX89-NEXT: s_mov_b32 s33, s6
-; CIGFX89-NEXT: s_waitcnt vmcnt(0)
-; CIGFX89-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: test_call_i1_func_void:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s2, s33
-; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, i1_func_void@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, i1_func_void@gotpcrel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v1, s30, 0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_writelane_b32 v1, s31, 1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_readlane_b32 s31, v1, 1
-; GFX11-NEXT: v_readlane_b32 s30, v1, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT: global_store_b8 v[2:3], v0, off dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: s_mov_b32 s33, s2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-
- %val = call i1 @i1_func_void()
- store volatile i1 %val, ptr addrspace(1) undef
- ret void
-}
-
-attributes #0 = { nounwind }
-
-